Tools to interact with NCBI's Pathogen Detection project • pdtools

Installation

remotes::install_github('jtrachsel/pdtools')

Description

A collection of functions for working with data from the NCBI Pathogen Detection project

Quickstart

List available organisms

library(pdtools)

list_organisms()

Download the most recent metadata for an organism:

system('mkdir data')
download_most_recent_complete('Campylobacter', folder_prefix = './data/')

Join downloads: metadata and SNP clusters

# The names of these files will change based on the most recent complete data
# you will have to adjust these
meta <- readr::read_tsv('./data/PDG000000003.1540.amr.metadata.tsv') %>% 
  dplyr::left_join(readr::read_tsv('./data/PDG000000003.1540.cluster_list.tsv'))

Extract agricultural host species from several metadata fields:

# create a two column tibble containing consensus host for each isolate
host_info <- extract_consensus_ag_species(meta)

# join back to metadata
meta <- meta %>% left_join(host_info)

Extract earliest year from 3 date columns

earliest_year <- meta %>% extract_earliest_year()

meta <- meta %>% left_join(earliest_year)

Generate ftp download paths for a selection of isolates

# download most recent assembly summary
download_gbk_assembly_summary('./data/assembly_summary.txt')

# select isolates associated with swine from after 2015
meta_filt <-
  meta %>%
  filter(Year > 2015 & ag_match == 'Swine') %>% 
  write_tsv('./data/swine_2015_meta.tsv')


# make a 'download data' tibble to organize and track downloads
# downloads both fna and gff files for each genome in the metadata
download_data <- 
  meta_filt %>% 
  select(asm_acc, ftp_path) %>% 
  make_ftp_paths(assembly_summary_path = './data/assembly_summary.txt') %>% 
  make_download_urls('fna') %>% 
  make_download_urls('gff') %>% 
  make_dest_paths(type='fna', dest_dir = './data/') %>% 
  make_dest_paths(type='gff', dest_dir = './data/') %>% 
  download_genomes('fna') %>% 
  download_genomes('gff') %>% 
  write_tsv('./data/download_data.tsv')

Generate an input file for calculating a pangenome with ppanggolin

# if you have some reference genomes that are complete (circularized) you can 
# pass their paths to the 'complete_genome_paths' parameter and the function
# will correctly specify circular contigs for ppanggolin.  

download_reference_genomes('LT2', 'fna', './reference_genomes/')
complete_genomes <- list.files('./reference_genomes/', '.fna', full.names = T)

draft_genomes <- list.files('./data/', '.fna', full.names = T)


build_ppanggolin_file_fastas(complete_genome_paths = complete_genomes, 
                             incomplete_genome_paths = draft_genomes) %>% 
  write_tsv('ppanggolin_file.tsv', col_names = FALSE)

Select a representative set of isolates from a pangenome

# Read in presence/absence matrix and format correctly:
pan_PA <-
  read_tsv('./pan/gene_presence_absence.Rtab')  %>% 
  column_to_rownames(var = 'Gene')  %>% 
  as.matrix() 

# this will return a small set of genomes that contain at least the proportion of
# genes you specify

# this set of genomes will contain 99% of all the genes detected in the pangenome
get_pangenome_representatives(pan_mat = pan_PA, SEED = 2, desired_coverage = .99)

TODO

Stats on available organisms, num clusters, most recent isolate https://www.ncbi.nlm.nih.gov/pathogens/organisms/
update_collection() function?
- Should take an old metadata file and a new metadata file as inputs.
- return a vector of new genomes that were not present in the old list.
- also return genomes with newer assembly accession version
extract_consensus_ag_species currently assumes isolates are from humans if the epi_type is clinical and no other information is available. This is probably wrong in some cases…
Download a more appropriate assembly summary file
- Or an assembly summary file for a species from the appropriate directory under genbank or refseq. e.g. ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/Salmonella_enterica/assembly_summary.txt