Description
A collection of functions for working with data from the NCBI Pathogen Detection project
Quickstart
Download the most recent metadata for an organism:
system('mkdir data')
download_most_recent_complete('Campylobacter', folder_prefix = './data/')
Extract agricultural host species from several metadata fields:
# create a two column tibble containing consensus host for each isolate
host_info <- extract_consensus_ag_species(meta)
# join back to metadata
meta <- meta %>% left_join(host_info)
Extract earliest year from 3 date columns
earliest_year <- meta %>% extract_earliest_year()
meta <- meta %>% left_join(earliest_year)
Generate ftp download paths for a selection of isolates
# download most recent assembly summary
download_gbk_assembly_summary('./data/assembly_summary.txt')
# select isolates associated with swine from after 2015
meta_filt <-
meta %>%
filter(Year > 2015 & ag_match == 'Swine') %>%
write_tsv('./data/swine_2015_meta.tsv')
# make a 'download data' tibble to organize and track downloads
# downloads both fna and gff files for each genome in the metadata
download_data <-
meta_filt %>%
select(asm_acc, ftp_path) %>%
make_ftp_paths(assembly_summary_path = './data/assembly_summary.txt') %>%
make_download_urls('fna') %>%
make_download_urls('gff') %>%
make_dest_paths(type='fna', dest_dir = './data/') %>%
make_dest_paths(type='gff', dest_dir = './data/') %>%
download_genomes('fna') %>%
download_genomes('gff') %>%
write_tsv('./data/download_data.tsv')
Generate an input file for calculating a pangenome with ppanggolin
# if you have some reference genomes that are complete (circularized) you can
# feed their paths into the 'complete_genome_paths' parameter and the function
# will correctly specify circular contigs for ppanggolin.
download_reference_genomes('LT2', 'fna', './reference_genomes/')
complete_genomes <- list.files('./reference_genomes/', '.fna', full.names = T)
draft_genomes <- list.files('./data/', '.fna', full.names = T)
build_ppanggolin_file_fastas(complete_genome_paths = complete_genomes,
incomplete_genome_paths = draft_genomes) %>%
write_tsv('ppanggolin_file.tsv', col_names = FALSE)
Select a representative set of isolates from a pangenome
# Read in presence/absence matrix and format correctly:
pan_PA <-
read_tsv('./pan/gene_presence_absence.Rtab') %>%
column_to_rownames(var = 'Gene') %>%
as.matrix()
# this will return a small set of genomes that contain at least the proportion of
# genes you specify
# this set of genomes will contain 99% of all the genes detected in the pangenome
get_pangenome_representatives(pan_mat = pan_PA, SEED = 2, desired_coverage = .99)
TODO
- Stats on available organisms, num clusters, most recent isolate https://www.ncbi.nlm.nih.gov/pathogens/organisms/
- update_collection() function?
- Should take an old metadata file and a new metadata file as inputs.
- return a vector of new genomes that were not present in the old list.
- also return genomes with newer assembly accession version
- Should take an old metadata file and a new metadata file as inputs.
- extract_consensus_ag_species currently assumes isolates are from humans if the epi_type is clinical and no other information is available. This is probably wrong in some cases…
- Download a more appropriate assembly summary file
- Or an assembly summary file for a species from the appropriate directory under genbank or refseq. e.g. ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/Salmonella_enterica/assembly_summary.txt