entrez_search()
The object returned by entrez_search() includes the number of records matching a given search.
This means you can learn a little about the composition of, or trends in, the records stored in the NCBI’s databases using only the search utility.
For instance, let’s track the rise of the scientific buzzword “connectome” in PubMed, programmatically creating search terms for the PDAT field:
library(rentrez)
#' Count the PubMed records matching a term for one year
#'
#' @param year Year placed in the PDAT field of the query.
#' @param term Search term combined with the year restriction.
#' @return The `count` element of the `entrez_search()` result.
search_year <- function(year, term) {
  # paste() joins the pieces with single spaces,
  # e.g. "Connectome AND ( 2008 [PDAT])".
  pubmed_query <- paste(term, "AND (", year, "[PDAT])")
  # Only the count is read from the result, hence retmax = 0.
  hits <- entrez_search(db = "pubmed", term = pubmed_query, retmax = 0)
  hits$count
}
# Years to tally; one PubMed search is issued per year.
year <- 2008:2014
# Unnamed vector of hit counts, in the same order as `year`.
papers <- sapply(
  X = year,
  FUN = search_year,
  term = "Connectome",
  USE.NAMES = FALSE
)
# type = "b" draws both points and connecting lines.
plot(year, papers, type = "b", main = "The Rise of the Connectome")
We can also search PubMed abstracts with the rentrez package.
You can also use Entrez to search PubMed, and the EUtils API allows you to limit searches by the year in which the paper was published.
For example, we can find the trendiest -omics going around (this involves quite a lot of repeated searching, so if you want to run your own version, be sure to do it at off-peak times).
Let’s start by making a function that finds the number of records matching a given search term for each of several years (using the mindate and maxdate terms from the Eutils API):
library(rentrez)
library(reshape2)
library(ggplot2)
#' Count PubMed records matching a search term in each of several years
#'
#' @param years Vector of years; one search is run per element.
#' @param search_term Term passed on to `entrez_search()`.
#' @return Numeric vector of record counts, one per element of `years`.
papers_by_year <- function(years, search_term) {
  count_for_year <- function(y) {
    # datetype is stated explicitly so that mindate/maxdate filter on the
    # publication date instead of whichever date field the server assumes.
    entrez_search(
      db = "pubmed", term = search_term,
      mindate = y, maxdate = y, datetype = "pdat",
      retmax = 0
    )$count
  }
  # vapply() pins the result type: an empty `years` yields numeric(0) rather
  # than the empty list sapply() would return.
  vapply(years, count_for_year, numeric(1))
}
years <- 1990:2015
# Empty search term: intended to give the total number of PubMed records per
# year, used as the denominator below.
total_papers <- papers_by_year(years, "")
omics <- c(
  "genomic", "epigenomic", "metagenomic", "proteomic",
  "transcriptomic", "pharmacogenomic", "connectomic"
)
# One column of yearly counts per term; vapply() checks that every term
# returns exactly one count per year.
trend_data <- vapply(
  omics,
  function(term) papers_by_year(years, term),
  numeric(length(years))
)
# The vector is recycled down each column, so every term's counts are divided
# by the total for the matching year.
trend_props <- trend_data / total_papers
trend_df <- melt(data.frame(years, trend_props), id.vars = "years")
p <- ggplot(trend_df, aes(years, value, colour = variable))
# The plotted values are proportions of all papers, not raw counts, so the
# axis is labelled accordingly. Zero proportions become -Inf on the log scale,
# which is what triggers ggplot2's infinite-values warning.
p + geom_line(size = 1) + scale_y_log10("proportion of papers")
## Warning: Transformation introduced infinite values in continuous y-axis
#' Count PubMed records matching a search term in each of several years
#'
#' @param years Vector of years; one search is run per element.
#' @param search_term Term passed on to `entrez_search()`.
#' @return Numeric vector of record counts, one per element of `years`.
papers_by_year <- function(years, search_term) {
  count_for_year <- function(y) {
    # datetype is stated explicitly so that mindate/maxdate filter on the
    # publication date instead of whichever date field the server assumes.
    entrez_search(
      db = "pubmed", term = search_term,
      mindate = y, maxdate = y, datetype = "pdat",
      retmax = 0
    )$count
  }
  # vapply() pins the result type: an empty `years` yields numeric(0) rather
  # than the empty list sapply() would return.
  vapply(years, count_for_year, numeric(1))
}
years <- 1990:2011
# Empty search term: intended to give the total number of PubMed records per
# year, used as the denominator below.
total_papers <- papers_by_year(years, "")
omics <- c(
  "genomic", "epigenomic", "metagenomic", "proteomic",
  "transcriptomic", "pharmacogenomic", "connectomic"
)
# One column of yearly counts per term; vapply() checks that every term
# returns exactly one count per year.
trend_data <- vapply(
  omics,
  function(term) papers_by_year(years, term),
  numeric(length(years))
)
# The vector is recycled down each column, so every term's counts are divided
# by the total for the matching year.
trend_props <- data.frame(trend_data / total_papers)
trend_props$years <- years
# trend_props is already a data.frame, so it is passed to melt() directly.
trend_df <- melt(trend_props, id.vars = "years")
p <- ggplot(trend_df, aes(years, value, colour = variable))
# The plotted values are proportions of all papers, not raw counts, so the
# axis is labelled accordingly. Zero proportions become -Inf on the log scale,
# which is what triggers ggplot2's infinite-values warning.
p + geom_line(size = 1) + scale_y_log10("proportion of papers")
## Warning: Transformation introduced infinite values in continuous y-axis