Skip to content

Commit fc1282a

Browse files
author
Kenneth Daily
authored
Merge pull request #38 from Sage-Bionetworks/split-query-types
Split query types
2 parents 22a1a9b + 9078f45 commit fc1282a

File tree

5 files changed

+88
-143
lines changed

5 files changed

+88
-143
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ Encoding: UTF-8
99
LazyData: true
1010
URL: https://github.com/Sage-Bionetworks/synapseusagereports
1111
BugReports: https://github.com/Sage-Bionetworks/synapseusagereports/issues
12-
RoxygenNote: 6.1.0
12+
RoxygenNote: 6.1.1
1313
Imports:
1414
dplyr,
1515
RMySQL,

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ export(firstMonthToVisit)
77
export(getData)
88
export(getQueryUserProfiles)
99
export(getTeamMemberDF)
10+
export(get_query_template_string)
1011
export(makeDateBreaks)
1112
export(makeDateBreaksStartEnd)
1213
export(multiMonthVisits)

R/lib.R

Lines changed: 66 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -33,59 +33,86 @@ render_report <- function(project_id, team_order, data_file, reportType="report"
3333

3434
}
3535

36+
# Named list of SQL query templates, keyed by query type ("pageview",
# "download", "filedownloadrecord").
# Each template contains two "%s" placeholders, which doQuery() fills via
# sprintf(template, start_date, end_date). The "%%" sequences in the
# DATE_FORMAT pattern are the sprintf escape for a literal "%".
# Every template joins against PROJECT_STATS, the temporary table that
# getData() creates before it runs the query.
query_template_strings <- list("pageview" = 'select ENTITY_ID,CONVERT(AR.TIMESTAMP, CHAR) AS TIMESTAMP,DATE,USER_ID,NODE_TYPE,N.NAME from ACCESS_RECORD AR, PROCESSED_ACCESS_RECORD PAR, NODE_SNAPSHOT N, PROJECT_STATS NODE where AR.RESPONSE_STATUS=200 AND AR.TIMESTAMP > unix_timestamp("%s")*1000 AND AR.TIMESTAMP < unix_timestamp("%s")*1000 AND AR.SESSION_ID = PAR.SESSION_ID and AR.TIMESTAMP = PAR.TIMESTAMP and PAR.ENTITY_ID = NODE.ID AND N.ID = NODE.ID and N.TIMESTAMP = NODE.TIMESTAMP and CLIENT IN ("WEB", "UNKNOWN") AND (PAR.NORMALIZED_METHOD_SIGNATURE IN ("GET /entity/#/bundle", "GET /entity/#/version/#/bundle", "GET /entity/#/wiki2", "GET /entity/#/wiki2/#"));',
37+
"download" = 'select ENTITY_ID,CONVERT(AR.TIMESTAMP, CHAR) AS TIMESTAMP,DATE,USER_ID,NODE_TYPE,N.NAME from ACCESS_RECORD AR, PROCESSED_ACCESS_RECORD PAR, NODE_SNAPSHOT N, PROJECT_STATS NODE where AR.TIMESTAMP > unix_timestamp("%s")*1000 AND AR.TIMESTAMP < unix_timestamp("%s")*1000 and (AR.RESPONSE_STATUS IN (200, 307)) AND AR.SESSION_ID = PAR.SESSION_ID and AR.TIMESTAMP = PAR.TIMESTAMP and PAR.ENTITY_ID = NODE.ID and N.ID = NODE.ID AND N.TIMESTAMP = NODE.TIMESTAMP and (PAR.NORMALIZED_METHOD_SIGNATURE IN ("GET /entity/#/file", "GET /entity/#/version/#/file"));',
38+
"filedownloadrecord" = 'SELECT FDR.ASSOCIATION_OBJECT_ID AS ENTITY_ID, CONVERT(FDR.TIMESTAMP , CHAR) AS TIMESTAMP, DATE_FORMAT(from_unixtime(FDR.TIMESTAMP / 1000), "%%Y-%%m-%%d") AS DATE, FDR.USER_ID, N.NODE_TYPE, N.NAME FROM FILE_DOWNLOAD_RECORD FDR, NODE_SNAPSHOT N, PROJECT_STATS WHERE FDR.TIMESTAMP > unix_timestamp("%s")*1000 AND FDR.TIMESTAMP < unix_timestamp("%s")*1000 AND N.ID = PROJECT_STATS.ID AND PROJECT_STATS.ID = FDR.ASSOCIATION_OBJECT_ID AND FDR.ASSOCIATION_OBJECT_TYPE = "FileEntity" AND N.TIMESTAMP = PROJECT_STATS.TIMESTAMP;')
39+
40+
#' Get the SQL query template string.
41+
#'
42+
#' @param query_type The name of the SQL query to get.
43+
#'
44+
#' @return An SQL query string.
3645
#' @export
37-
report_data_query <- function(con, project_id, start_date, end_date) {
38-
39-
project_id <- gsub("syn", "", project_id)
46+
#'
47+
#' @examples
#' get_query_template_string("pageview")
48+
get_query_template_string <- function(query_type) {
49+
# Validate against the template list itself, so adding a template never
# requires touching this check. Reject NULL, NA, non-character and
# multi-element input here, so the caller gets the intended error instead of
# an unrelated failure from `if` receiving a zero- or multi-length condition.
if (!is.character(query_type) || length(query_type) != 1L || !(query_type %in% names(query_template_strings))) {
50+
stop("Not a valid query type.")
51+
}
4052

41-
qPageviewTemplate <- 'select ENTITY_ID,CONVERT(AR.TIMESTAMP, CHAR) AS TIMESTAMP,DATE,USER_ID,NODE_TYPE,N.NAME from ACCESS_RECORD AR, PROCESSED_ACCESS_RECORD PAR, NODE_SNAPSHOT N, PROJECT_STATS NODE where AR.RESPONSE_STATUS=200 AND AR.TIMESTAMP > unix_timestamp("%s")*1000 AND AR.TIMESTAMP < unix_timestamp("%s")*1000 AND AR.SESSION_ID = PAR.SESSION_ID and AR.TIMESTAMP = PAR.TIMESTAMP and PAR.ENTITY_ID = NODE.ID AND N.ID = NODE.ID and N.TIMESTAMP = NODE.TIMESTAMP and CLIENT IN ("WEB", "UNKNOWN") AND (PAR.NORMALIZED_METHOD_SIGNATURE IN ("GET /entity/#/bundle", "GET /entity/#/version/#/bundle", "GET /entity/#/wiki2", "GET /entity/#/wiki2/#"));'
53+
return(query_template_strings[[query_type]])
54+
}
4255

43-
qDownloadTemplate <- 'select ENTITY_ID,CONVERT(AR.TIMESTAMP, CHAR) AS TIMESTAMP,DATE,USER_ID,NODE_TYPE,N.NAME from ACCESS_RECORD AR, PROCESSED_ACCESS_RECORD PAR, NODE_SNAPSHOT N, PROJECT_STATS NODE where AR.TIMESTAMP > unix_timestamp("%s")*1000 AND AR.TIMESTAMP < unix_timestamp("%s")*1000 and (AR.RESPONSE_STATUS IN (200, 307)) AND AR.SESSION_ID = PAR.SESSION_ID and AR.TIMESTAMP = PAR.TIMESTAMP and PAR.ENTITY_ID = NODE.ID and N.ID = NODE.ID AND N.TIMESTAMP = NODE.TIMESTAMP and (PAR.NORMALIZED_METHOD_SIGNATURE IN ("GET /entity/#/file", "GET /entity/#/version/#/file"));'
56+
#' Run one usage query for a project and post-process the result.
#'
#' @param con Database connection handed to getData().
#' @param project_id Synapse project ID; every "syn" substring is removed
#'   before the ID is used in the query.
#' @param query_type Name of the query template to run; it must be a type
#'   accepted by get_query_template_string().
#' @param start_date,end_date Date range handed to makeDateBreaksStartEnd().
#'
#' @return The output of processQuery() applied to the queried rows after a
#'   recordType column has been set to `query_type`.
#' @export
57+
report_data_query <- function(con, project_id, query_type, start_date, end_date) {
4458

45-
qFDRTemplate <- 'SELECT FDR.ASSOCIATION_OBJECT_ID AS ENTITY_ID, CONVERT(FDR.TIMESTAMP , CHAR) AS TIMESTAMP, DATE_FORMAT(from_unixtime(FDR.TIMESTAMP / 1000), "%%Y-%%m-%%d") AS DATE, FDR.USER_ID, N.NODE_TYPE, N.NAME FROM FILE_DOWNLOAD_RECORD FDR, NODE_SNAPSHOT N, PROJECT_STATS WHERE FDR.TIMESTAMP > unix_timestamp("%s")*1000 AND FDR.TIMESTAMP < unix_timestamp("%s")*1000 AND N.ID = PROJECT_STATS.ID AND PROJECT_STATS.ID = FDR.ASSOCIATION_OBJECT_ID AND FDR.ASSOCIATION_OBJECT_TYPE = "FileEntity" AND N.TIMESTAMP = PROJECT_STATS.TIMESTAMP;'
59+
message(sprintf("Generating a %s report", query_type))
4660

61+
# Remove "syn" from the ID: getData() interpolates it into SQL as
# "PROJECT_ID = %s".
project_id <- gsub("syn", "", project_id)
4762

4863
# Build the date breaks and drop any row with a missing start or end date.
timestampBreaksDf <- makeDateBreaksStartEnd(start_date, end_date) %>%
4964
filter(!is.na(start_date), !is.na(end_date))
5065

51-
queryDataPageviews <- getData(con=con,
52-
qTemplate=qPageviewTemplate,
53-
projectId=project_id,
54-
timestampBreaksDf=timestampBreaksDf)
66+
# Stops with an error here if query_type is not a known template name.
query_template <- get_query_template_string(query_type)
67+
68+
queryData <- getData(con = con,
69+
qTemplate = query_template,
70+
projectId = project_id,
71+
timestampBreaksDf = timestampBreaksDf)
5572

56-
queryDataPageviewsProcessed <- queryDataPageviews %>%
57-
dplyr::mutate(recordType='pageview') %>%
73+
# Tag every row with the query type before the shared post-processing step.
queryDataProcessed <- queryData %>%
74+
dplyr::mutate(recordType = query_type) %>%
5875
processQuery()
5976

60-
queryDataDownloads <- getData(con=con,
61-
qTemplate=qDownloadTemplate,
62-
projectId=project_id,
63-
timestampBreaksDf=timestampBreaksDf)
6477

65-
queryDataDownloadsProcessed <- queryDataDownloads %>%
66-
dplyr::mutate(recordType='download') %>%
67-
processQuery()
78+
return(queryDataProcessed)
79+
}
6880

69-
queryDataFDR <- getData(con=con,
70-
qTemplate=qFDRTemplate,
71-
projectId=project_id,
72-
timestampBreaksDf=timestampBreaksDf)
7381

74-
queryDataFDRProcessed <- queryDataFDR %>%
75-
dplyr::mutate(recordType='download') %>%
76-
processQuery()
82+
#' Run the pageview, download and filedownloadrecord queries and combine them.
#'
#' Calls report_data_query() once per query type, in that order, and row-binds
#' the three results in the same order. Rows from the "filedownloadrecord"
#' query are relabelled with recordType "download".
#'
#' @param con Database connection passed through to report_data_query().
#' @param project_id Synapse project ID passed through to report_data_query().
#' @param start_date,end_date Date range passed through to report_data_query().
#'
#' @return The row-bound result of the three queries.
#' @export
report_data_query_all <- function(con, project_id, start_date, end_date) {
83+
84+
# Each variable is named after the query type it holds, so the rbind() below
# yields pageview rows, then download rows, then file-download-record rows.
queryDataPageview <- report_data_query(con = con,
85+
project_id = project_id,
86+
query_type = "pageview",
87+
start_date = start_date,
88+
end_date = end_date)
89+
90+
queryDataDownload <- report_data_query(con = con,
91+
project_id = project_id,
92+
query_type = "download",
93+
start_date = start_date,
94+
end_date = end_date)
95+
7796

78-
queryData <- rbind(queryDataPageviewsProcessed,
79-
queryDataDownloadsProcessed,
80-
queryDataFDRProcessed)
97+
queryDataFDR <- report_data_query(con = con,
98+
project_id = project_id,
99+
query_type = "filedownloadrecord",
100+
start_date = start_date,
101+
end_date = end_date)
102+
103+
# report_data_query() tags these rows "filedownloadrecord"; relabel them as
# "download" so they share a recordType with the "download" query rows.
queryDataFDR$recordType <- "download"
104+
105+
queryData <- rbind(queryDataPageview,
106+
queryDataDownload,
107+
queryDataFDR)
81108

82109
return(queryData)
83110
}
84111

85-
86112
#' @export
87113
doQuery <- function(con, template, projectId, start_date, end_date) {
88114
q <- sprintf(template, start_date, end_date)
115+
message(sprintf("Query: %s", q))
89116
message(sprintf("Querying %s to %s", start_date, end_date))
90117

91118
res <- DBI::dbGetQuery(conn = con, statement=q)
@@ -115,10 +142,16 @@ processQuery <- function(data) {
115142

116143
#' @export
117144
getData <- function(con, qTemplate, projectId, timestampBreaksDf) {
118-
119-
q.create_temp <- "CREATE TEMPORARY TABLE PROJECT_STATS SELECT ID, MAX(TIMESTAMP) AS TIMESTAMP FROM NODE_SNAPSHOT WHERE PROJECT_ID = %s GROUP BY ID;"
145+
q.create_temp <- "CREATE TEMPORARY TABLE PROJECT_STATS (`TIMESTAMP` bigint(20) NOT NULL, `ID` bigint(20) NOT NULL, PRIMARY KEY (`ID`,`TIMESTAMP`)); "
120146
create <- DBI::dbSendQuery(conn=con,
121-
statement=sprintf(q.create_temp, projectId))
147+
statement=q.create_temp)
148+
message(sprintf("Created temporary table for entities in project %s", projectId))
149+
150+
q.insert_temp <- "INSERT INTO PROJECT_STATS (ID, TIMESTAMP) SELECT ID, MAX(TIMESTAMP) AS TIMESTAMP FROM NODE_SNAPSHOT WHERE PROJECT_ID = %s GROUP BY ID;"
151+
query_statement <- sprintf(q.insert_temp, projectId)
152+
insert <- DBI::dbSendQuery(conn=con,
153+
statement=query_statement)
154+
message(sprintf("Inserted rows into temporary table for entities in project %s", projectId))
122155

123156
res <- plyr::ddply(timestampBreaksDf, plyr::.(month, year),
124157
function (x) doQuery(con=con,

inst/scripts/report_data_query.R

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ option_list <- list(
1313
help = "Synapse Project ID.",
1414
dest = "project_id",
1515
metavar = "synapseid"),
16+
make_option(c("--query_type"), type = "character",
17+
help = "Type of query to perform. One of: download, pageview, filedownloadrecord, all",
18+
dest = "query_type"),
1619
make_option(c("--start_date"), type = "character",
1720
help = "Date at UTC (YYYY-MM-DD format)",
1821
dest = "start_date"),
@@ -22,7 +25,7 @@ option_list <- list(
2225
make_option(c("--config_file"), type = "character",
2326
help = "YAML database configuration file.",
2427
dest = "config_file",
25-
default="~/datawarehouse_config.yml")
28+
default = "~/datawarehouse_config.yml")
2629
)
2730

2831
opts <- parse_args(OptionParser(option_list = option_list))
@@ -38,15 +41,23 @@ if (n_months_from_today > 6) {
3841

3942
config <- yaml.load_file(opts$config_file)
4043

41-
con <- dbConnect(MySQL(),
42-
user = config$username,
43-
password = config$password,
44-
host = config$host,
45-
dbname=config$db)
44+
con <- RMySQL::dbConnect(RMySQL::MySQL(),
45+
user = config$username,
46+
password = config$password,
47+
host = config$host,
48+
dbname = config$db)
49+
50+
# --query_type has no default, so opts$query_type is NULL when the flag is
# omitted, and `NULL == "all"` would abort the script with "argument is of
# length zero". Treat a missing flag as "all": before the query types were
# split, this script always ran every query.
if (is.null(opts$query_type) || opts$query_type == "all") {
51+
queryData <- report_data_query_all(con, project_id = opts$project_id,
52+
start_date = start_date,
53+
end_date = end_date)
54+
} else {
55+
queryData <- report_data_query(con, project_id = opts$project_id,
56+
query_type = opts$query_type,
57+
start_date = start_date,
58+
end_date = end_date)
4659

47-
queryData <- report_data_query(con, project_id = opts$project_id,
48-
start_date = start_date,
49-
end_date = end_date)
60+
}
5061

5162
cat(readr::format_csv(queryData))
5263

inst/templates/report.Rmd

Lines changed: 0 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -95,106 +95,6 @@ queryData %>%
9595

9696
There are `r length(setdiff(unique(queryData$userName), c("anonymous")))` active registered Synapse users in this time period. Of these, `r multiMonthVisits(queryData) %>% nrow` users were active in the project in at least two different months.
9797

98-
## Project page views
99-
100-
Count of the number of page views of the main project Wiki (`r params$projectId`) per month.
101-
102-
```{r}
103-
projectPageViews <- queryData %>%
104-
filter(recordType=='pageview', NODE_TYPE=='project', id == int_project_id)
105-
106-
if (nrow(projectPageViews) > 0) {
107-
if (useTeamGrouping) {
108-
projectPageViewsCount <- projectPageViews %>%
109-
dplyr::filter(recordType == 'pageview') %>%
110-
dplyr::count(teamName, dateGrouping) %>%
111-
reshape2::dcast(teamName ~ dateGrouping, fun.aggregate = sum)
112-
} else {
113-
projectPageViewsCount <- projectPageViews %>%
114-
dplyr::filter(recordType == 'pageview') %>%
115-
dplyr::mutate(teamName='All') %>%
116-
dplyr::count(teamName, dateGrouping) %>%
117-
reshape2::dcast(teamName ~ dateGrouping)
118-
}
119-
120-
projectPageViewsCount %>% knitr::kable()
121-
}
122-
123-
```
124-
125-
#### Page views per month
126-
127-
Count of the page views of any page (Wiki or entity) in the entire project per month.
128-
129-
```{r loadpermonth, include=TRUE, eval=TRUE}
130-
if (useTeamGrouping) {
131-
dateGroupingCount <- queryData %>%
132-
dplyr::filter(recordType == 'pageview') %>%
133-
dplyr::count(teamName, dateGrouping) %>%
134-
reshape2::dcast(teamName ~ dateGrouping, fun.aggregate = sum)
135-
} else {
136-
dateGroupingCount <- queryData %>%
137-
dplyr::filter(recordType == 'pageview') %>%
138-
dplyr::mutate(teamName='All') %>%
139-
dplyr::count(teamName, dateGrouping) %>%
140-
reshape2::dcast(teamName ~ dateGrouping)
141-
}
142-
143-
dateGroupingCount %>% knitr::kable()
144-
```
145-
146-
#### Page views per day
147-
148-
Plot of the page views of any page (Wiki or entity) in the entire project per day.
149-
150-
```{r plotperday, fig.width=20, fig.height=6, include=TRUE, eval=TRUE}
151-
perdayCount <- countByDay(queryData %>% filter(recordType == 'pageview'),
152-
useTeamGrouping)
153-
154-
if (nrow(perdayCount) > 0) {
155-
plotByDay(perdayCount, useTeamGrouping)
156-
}
157-
```
158-
159-
#### Entity page views
160-
161-
The top 50 Files or Folders with at least 5 views.
162-
163-
```{r include=TRUE, eval=TRUE}
164-
### Data
165-
tmp <- queryData %>%
166-
dplyr::filter(recordType == 'pageview') %>%
167-
dplyr::count(id, NAME, NODE_TYPE) %>%
168-
dplyr::filter(n >= 5, !stringr::str_detect(id, "acl"))
169-
170-
if (nrow(tmp) > 0) {
171-
dataaccessCount1 <- queryData %>%
172-
dplyr::filter(recordType == 'pageview') %>%
173-
dplyr::filter(id %in% tmp$id) %>%
174-
dplyr::count(id, NAME, NODE_TYPE, dateGrouping) %>%
175-
dplyr::ungroup() %>%
176-
reshape2::dcast(id + NAME + NODE_TYPE ~ dateGrouping, value.var='n') %>%
177-
dplyr::mutate(name=sprintf("<a href='https://www.synapse.org/#!Synapse:syn%s' target='_blank'>%s</a>", id, NAME))
178-
# dplyr::mutate(name=sprintf("[%s](https://www.synapse.org/#!Synapse:syn%s)", NAME, id))
179-
180-
181-
dataaccessCount2 <- queryData %>%
182-
dplyr::filter(recordType == 'pageview') %>%
183-
dplyr::filter(id %in% tmp$id) %>%
184-
dplyr::count(id, NAME, NODE_TYPE) %>%
185-
dplyr::ungroup() %>%
186-
dplyr::arrange(dplyr::desc(n))
187-
188-
dataaccessCount <- dataaccessCount1 %>%
189-
left_join(dataaccessCount2, by=c("id", "NAME", "NODE_TYPE")) %>%
190-
dplyr::arrange(dplyr::desc(n)) %>%
191-
head(50) %>%
192-
dplyr::rename(total=n) %>%
193-
dplyr::select(name, everything(), total, -id, -NAME)
194-
195-
dataaccessCount %>% DT::datatable(options=list(pageLength=20), escape=1)
196-
}
197-
```
19898

19999
#### Entity downloads
200100

0 commit comments

Comments
 (0)