Commit d5437bb4 authored by Matija Obreza's avatar Matija Obreza

Using API v1

parent 5e5d8f98
Package: genesysr
Version: 0.9.4
Version: 1.0.0
Title: Genesys PGR Client
Description: Access data on plant genetic resources from genebanks around the world published on Genesys (<https://www.genesys-pgr.org>).
Your use of data is subject to terms and conditions available at <https://www.genesys-pgr.org/content/legal/terms>.
......@@ -17,5 +17,6 @@ RoxygenNote: 6.1.1
URL: https://gitlab.croptrust.org/genesys-pgr/genesysr
BugReports: https://gitlab.croptrust.org/genesys-pgr/genesysr/issues
Suggests: knitr,
rmarkdown
rmarkdown,
tidyverse
VignetteBuilder: knitr
# Generated by roxygen2: do not edit by hand
export(api1_url)
export(api_url)
export(api2_url)
export(authorization)
export(check_country)
export(check_landorsea)
......@@ -11,8 +11,11 @@ export(download_mcpd)
export(download_pdci)
export(fetch_accessions)
export(filter_DOI)
export(filter_GENUS)
export(filter_ORIGCTY)
export(filter_SAMPSTAT)
export(filter_SPECIES)
export(get_accessions)
export(mcpd_filter)
export(me)
export(print_setup)
......
......@@ -163,7 +163,7 @@ client_login <- function() {
#' @keywords internal
.api_call <- function(path, method = "get") {
.check_auth()
resp <- httr::GET(api_url(path), httr::add_headers(
resp <- httr::GET(path, httr::add_headers(
Authorization = .genesysEnv$Authorization
)
)
......@@ -177,16 +177,16 @@ client_login <- function() {
}
#' Get full Genesys API URL for a specific path
#' Get full Genesys API v1 URL for a specific path
#'
#' @param path relative path of the API endpoint (e.g. \code{/me})
#' @param path relative path of the API v1 endpoint (e.g. \code{/me})
#'
#' @return Absolute URL to an API call
#' @export
#'
#' @examples
#' api_url("/me")
api_url <- function(path) {
#' api1_url("/me")
api1_url <- function(path) {
paste0(.genesysEnv$server, "/api/v1", path)
}
......@@ -199,15 +199,15 @@ api_url <- function(path) {
#' @export
#'
#' @examples
#' api1_url("/me")
api1_url <- function(path) {
paste0(.genesysEnv$server, "/api/v1", path)
#' api2_url("/me")
api2_url <- function(path) {
paste0(.genesysEnv$server, "/api/v2", path)
}
#' @keywords internal
.get <- function(path, query = NULL) {
.check_auth()
resp <- httr::GET(api_url(path), query = query, httr::add_headers(
resp <- httr::GET(path, query = query, httr::add_headers(
Authorization = .genesysEnv$Authorization
))
if (httr::http_type(resp) != "application/json") {
......@@ -225,7 +225,7 @@ api1_url <- function(path) {
#'
#' @return httr response
#' @keywords internal
.post <- function(path, query = NULL, body = NULL, content.type = "application/json") {
.post <- function(path, query = NULL, body = NULL, content.type = "application/json", accept = "application/json") {
.check_auth()
content <- jsonlite::toJSON(body)
if (! is.null(body) && length(body) == 0) {
......@@ -233,15 +233,20 @@ api1_url <- function(path) {
content <- "{}"
}
# print(paste("Body is:", content))
resp <- httr::POST(api_url(path), query = query, httr::add_headers(
resp <- httr::POST(path, query = query, httr::add_headers(
Authorization = .genesysEnv$Authorization,
"Content-Type" = content.type
"Content-Type" = content.type,
"Accept" = accept
), body = content)
if (httr::status_code(resp) != 200) {
stop("Genesys responded with HTTP status code ", httr::status_code(resp), ". Expected 200. See response content:\n", httr::content(resp), call. = FALSE)
}
if (httr::http_type(resp) != "application/json") {
stop("API did not return JSON, but Content-Type: ", httr::content(resp), ". See response content:\n", httr::content(resp), call. = FALSE)
if (httr::http_type(resp) != accept) {
stop("API did not return ", accept, " but Content-Type: ", httr::content(resp), ". See response content:\n", httr::content(resp), call. = FALSE)
}
# if (httr::http_type(resp) != "application/json") {
# stop("API did not return json", call. = FALSE)
# }
resp
}
......@@ -13,13 +13,14 @@
# limitations under the License.
#' Max pages to retrieve
#' @keywords internal
.MAX_ALLOWED_PAGES <- 500
#' Who am i?
#'
#' @export
me <- function() {
resp <- .api_call("/me")
resp <- .api_call(api1_url("/me/profile"))
message(jsonlite::toJSON(resp, pretty = TRUE))
invisible(resp)
}
......@@ -40,6 +41,7 @@ me <- function() {
#' }
#'
#' @return Paged data structure
#' @keywords internal
.fetch_accessions_page <- function(filters = list(), page = 0, size = 1000, selector = NULL) {
start_time <- as.numeric(as.numeric(Sys.time())*1000, digits=15)
resp <- .post(path = "/acn/list", query=list(p = page, l = size), body = filters)
......@@ -105,7 +107,7 @@ fetch_accessions <- function(filters = list(), page = NULL, size = 1000, selecto
break
}
if (! is.null(at.least) && at.least <= paged$numberOfElements) {
message(paste("Receved", paged$numberOfElements, "of", at.least, "requested. Stopping."))
message(paste("Received", paged$numberOfElements, "of", at.least, "requested. Stopping."))
break
}
}
......@@ -114,6 +116,106 @@ fetch_accessions <- function(filters = list(), page = NULL, size = 1000, selecto
}
#' List accession passport data (paginated)
#'
#' @return table
#' @keywords internal
.list_accessions_page <- function(filters = list(), page = 0, size = 1000, fields = NULL, selector = NULL) {
start_time <- as.numeric(as.numeric(Sys.time())*1000, digits=15)
query <- list(p = page, l = size)
if (is.vector(fields)) {
selected_fields <- stats::setNames(as.list(fields), rep('select', length(fields)))
query <- c(query, selected_fields)
}
resp <- .post(path = api1_url("/acn/list"), query = query, body = filters, accept = "text/csv")
if (httr::status_code(resp) != 200) {
stop("Genesys responded with HTTP status code ", httr::status_code(resp), ". Expected 200.")
}
if (httr::http_type(resp) != "text/csv") {
stop("API returned ", httr::http_type(resp), ". Expected text/csv.")
}
end_time <- as.numeric(as.numeric(Sys.time())*1000, digits=15)
headers <- httr::headers(resp)
message(paste("Retrieved page", page + 1, "with", headers$`pagination-elements`, "rows in", end_time - start_time, "ms."))
body <- httr::content(resp, "text")
if (nchar(trimws(body)) == 0) {
message(paste("Received 0 bytes"))
data <- data.frame()
} else {
data <- read.csv(text = body, quote = '"', sep = '\t', stringsAsFactors = FALSE)
# Apply selector
if (is.function(selector)) {
data <- lapply(data, selector)
}
}
data
}
#' Fetch accession passport data
#'
#' @param filters an R \code{structure} with Genesys filters
#' @param size number of records to load per page (page size)
#' @param page the page index (0-based)
#' @param fields list of fields to fetch from Genesys
#' @param selector NULL or a function to "select" variables of interest
#' @param at.least stop fetching when at.least records are received from Genesys
#'
#' @seealso \code{\link{mcpd_filter}}
#'
#' @examples
#' \dontrun{
#' # Retrieve all accession data by country of origin (Slovenia, Ivory Coast)
#' accessions <- genesysr::get_accessions(list(countryOfOrigin = list(code3 = c('SVN', 'CIV'))))
#'
#' # Fetch Musa, but only geographic data and accessionNumber
#' musa <- genesysr::get_accessions(list(taxonomy = list(genus = c('Musa'))),
#' fields = c("accessionNumber", "geo"))
#'
#' # Apply selector function
#' accessions <- get_accessions(mcpd_filter(ORIGCTY = c('DEU', 'SVN')),
#' selector = function(x) {
#' list(id = x$id, acceNumb = x$accessionNumber, instCode = x$instituteCode)
#' }, at.least = 100)
#' }
#'
#' @export
#' @return Paged data structure
get_accessions <- function(filters = list(), page = 0, size = 1000, fields = NULL, selector = NULL, at.least = NULL) {
# Fetch first page to determine number of records
data <- .list_accessions_page(filters, page, size, fields, selector)
while (page < .MAX_ALLOWED_PAGES && !(! is.null(at.least) && at.least <= nrow(data))) {
page <- page + 1
if (page >= .MAX_ALLOWED_PAGES) {
# Break if over max pages
message(paste("Not requesting data after page", .MAX_ALLOWED_PAGES, "Stopping."))
break
}
p <- .list_accessions_page(filters, page, size, fields)
if (nrow(p) == 0) {
# print("Got last page")
break
}
data[setdiff(names(p), names(data))] <- NA
p[setdiff(names(data), names(p))] <- NA
data <- rbind(data, p)
if (! is.null(at.least) && at.least <= nrow(data)) {
message(paste("Receved", nrow(data), "of", at.least, "requested. Stopping."))
break
}
}
data
}
#' Download all passport data for one genebank in Excel format and save it to disk
#'
#' @param instituteCode FAO WIEWS institute code
......
......@@ -2,6 +2,8 @@
The `genesysr` R package provides functions for authentication with Genesys and functions to fetch accession data from <https://www.genesys-pgr.org> database.
Note: See NEWS.md
## Installing the development version
```R
......@@ -63,5 +65,9 @@ genesysr::client_login()
```R
filters <- mcpd_filter(ORIGCTY = c("DEU", "SVN"))
accessions <- genesysr::fetch_accessions(filters)
accessions <- genesysr::get_accessions(filters)
# Sort columns, hand-pick first few columns
require(data.table)
setcolorder(accessions, unique(c("id", "instituteCode", "accessionNumber", "taxonomy.genus", sort(names(accessions)))))
```
## Test environments
* local OS X install, R 3.4.4
* devtools::check_win_devel();
* devtools::spell_check()
* devtools::check_win_devel()
* devtools::check_rhub()
## R CMD check results
......@@ -9,7 +10,8 @@
## Other
This is a minor update due to changed hostname for Genesys API to api.genesys-pgr.org.
Updated R code for new Genesys API endpoints. Updated
the user guide and function documentation.
## Reverse dependencies
......
---
title: "genesysr Tutorial"
author: "Nora Castaneda & Matija Obreza"
author: "Matija Obreza & Nora Castaneda"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
......@@ -23,7 +23,7 @@ it into R.
## For the impatient
Accession passport data is retrieved with the `fetch_accessions` function.
Accession passport data is retrieved with the `get_accessions` function.
The database is queried by providing a `filter` (see Filters below):
......@@ -36,36 +36,41 @@ The database is queried by providing a `filter` (see Filters below):
genesysr::user_login()
# Retrieve first 1000 accessions for genus *Musa*
musa <- fetch_accessions(filters = list(taxonomy = list(genus = c('Musa'))), at.least = 1000)
musa <- get_accessions(filters = list(taxonomy = list(genus = c('Musa'))), at.least = 1000)
# Or retrieve all accession data for genus *Musa*
musa <- fetch_accessions(filters = list(taxonomy = list(genus = c('Musa'))))
musa <- get_accessions(filters = list(taxonomy = list(genus = c('Musa'))))
# Retrieve all accession data for the Musa International Transit Center, Bioversity International
itc <- fetch_accessions(list(institute = list(code = c('BEL084'))))
itc <- get_accessions(list(institute = list(code = c('BEL084'))))
# Retrieve all accession data for the Musa International Transit Center, Bioversity International (BEL084) and the International Center for Tropical Agriculture (COL003)
some <- fetch_accessions(list(institute = list(code = c('BEL084','COL003'))))
some <- get_accessions(list(institute = list(code = c('BEL084','COL003'))))
```
**genesysr** provides utility functions to create `filter` objects using [Multi-Crop Passport Descriptors (MCPD)](https://www.genesys-pgr.org/documentation/basics) definitions:
```
# Retrieve data by country of origin (MCPD)
fetch_accessions(mcpd_filter(ORIGCTY = c("DEU", "SVN")))
get_accessions(mcpd_filter(ORIGCTY = c("DEU", "SVN")))
```
# Processing fetched data
Fetched data is provided as a deeply nested list. To flatten the list, the following steps are suggested:
The data is provided by Genesys as CSV. Where multiple values are possible for a column,
there will be multiple columns. For example, accession `STORAGE` may be provided as:
```r
require(tidyverse)
musa.flatten <- lapply(musa$content, unlist) # looks good
musa.flatten <- musa.flatten %>% map_df(bind_rows)
```
|...|storage1|storage2|storage3|
|--|--|--|--|
|...|10|20|30|
|...|30|40|*NA*|
|...|30|*NA*|*NA*|
|...|10|20|30|
# Filters
The `filter` object is a named `list()` where names match a Genesys filter and the value
specifies the criteria to match.
The records returned by Genesys match all filters provided (*AND* operation), while individual filters
allow for specifying multiple criteria (*OR* operation):
......@@ -78,12 +83,14 @@ filter <- list();
filter$taxonomy$genus = c('Musa')
filter$taxonomy$species = c('aa')
filter$countryOfOrigin$iso3 = c('NGA', 'CIV')
```
There are a number of filtering options to retrieve current data from Genesys.
The `filter` object is a named `list()` where names match a Genesys filter and the value
specifies the criteria to match.
# See filter object as JSON
jsonlite::toJSON(filters)
```
There are a number of filtering options to retrieve data from Genesys. Best explore how filtering
works on the actual website https://www.genesys-pgr.org/a/overview by inspecting the HTTP requests
sent by your browser to the API server and then replicating them here.
### Taxonomy
......@@ -140,20 +147,18 @@ list(institute = list(country = list(iso3 = c('SVN', 'BEL'))))
# Selecting columns
Genesys API returns a lot of variables for accession passport data.
To reduce the amount of data to be processed and kept in memory, select the columns of interest with a `selector` function:
To reduce the amount of data to be processed and kept in memory, select the columns of interest the `fields` vector:
```
# Keep only accession id, acceNumb and instCode for *Musa* data
fetch_accessions(list(taxonomy.genus = c('Musa')), at.least = 100, selector = function(x) {
list(id = x$id, acceNumb = x$accessionNumber, instCode = x$institute$code)
})
# Fetch only accession id, storage and taxonomic data for *Musa*
musa <- genesysr::get_accessions(list(taxonomy = list(genus = c('Musa'))), fields = c("taxonomy", "storage", "id"))
```
To list the variable names returned by the Genesys APIs, test the response and select columns of interest:
```r
filters <- list()
accn <- fetch_accessions(filters, at.least = 100)
# fetch_accessions uses the JSON format
accn <- fetch_accessions(filters = list(), at.least = 100)
# Print names used in JSON response from Genesys
sort(unique(names(unlist(accn$content))))
......@@ -171,6 +176,7 @@ library(genesysr)
```
2. Setup using user credentials
```r
setup_sandbox()
user_login()
......@@ -179,13 +185,11 @@ user_login()
3. Fetch data
```r
musa <- genesysr::fetch_accessions(list(taxonomy = list(genus = c('Musa'))), at.least = 1000)
musa <- genesysr::get_accessions(list(taxonomy = list(genus = c('Musa'))), at.least = 1000)
```
4. Flatten data into data frame
4. Identify columns of interest
```r
require(tidyverse)
musa.flatten <- lapply(musa$content, unlist) #looks good
musa.flatten <- musa.flatten %>% map_df(bind_rows)
names(musa)
```
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment