@@ -17,6 +17,12 @@ import textwrap
1717import argparse
1818import shutil
1919import filecmp
20+ import ssl
21+ ###########
22+ # temp fix for certificate failure: https://github.com/AstrobioMike/GToTree/issues/107
23+ ssl ._create_default_https_context = ssl ._create_unverified_context
24+ ###########
25+
2026
2127parser = argparse .ArgumentParser (description = "This is a helper program to facilitate using taxonomy \
2228 and genomes from the Genome Taxonomy Database (gtdb.ecogenomic.org) with GToTree.\
@@ -53,7 +59,9 @@ parser.add_argument("--do-not-check-GTDB-version", action="store_true", help="By
5359parser .add_argument ("--store-GTDB-metadata-in-current-working-dir" , action = "store_true" , help = "By default, GToTree uses a system-wide variable \
5460 to know where to put and search the GTDB metadata. Provide this flag to \
5561 ignore that and store the master table in the current working directory." )
56-
62+ parser .add_argument ("--use-ecogenomics" , action = "store_true" , help = "By default, we try to pull the data from 'https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/' \
63+ instead of 'https://data.gtdb.ecogenomic.org/releases/latest/'. Add this flag to try to pull from \
64+ the ecogenomics site (might be much slower depending on where you are)." )
5765
5866if len (sys .argv )== 1 :
5967 parser .print_help (sys .stderr )
@@ -408,15 +416,19 @@ def gen_gtdb_tab(location):
408416 """ downloads and parses the GTDB info tables """
409417
410418 # getting archaea
411- # arc_tsv_gz = urllib.request.urlopen("https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tsv.gz")
412- arc_tsv_gz = urllib .request .urlopen ("https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv.gz" )
419+ if args .use_ecogenomics :
420+ arc_tsv_gz = urllib .request .urlopen ("https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tsv.gz" )
421+ else :
422+ arc_tsv_gz = urllib .request .urlopen ("https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv.gz" )
413423 arc_tab = pd .read_csv (arc_tsv_gz , sep = "\t " , compression = "gzip" , on_bad_lines = 'skip' , header = 0 , low_memory = False )
414424 arc_tab .rename (columns = {arc_tab .columns [0 ]:"accession" }, inplace = True )
415425 arc_tab .dropna (inplace = True , how = "all" )
416426
417427 # getting bacteria
418- # bac_tsv_gz = urllib.request.urlopen("https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tsv.gz")
419- bac_tsv_gz = urllib .request .urlopen ("https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv.gz" )
428+ if args .use_ecogenomics :
429+ bac_tsv_gz = urllib .request .urlopen ("https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tsv.gz" )
430+ else :
431+ bac_tsv_gz = urllib .request .urlopen ("https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv.gz" )
420432 bac_tab = pd .read_csv (bac_tsv_gz , sep = "\t " , compression = "gzip" , on_bad_lines = 'skip' , header = 0 , low_memory = False )
421433 bac_tab .rename (columns = {bac_tab .columns [0 ]:"accession" }, inplace = True )
422434 bac_tab .dropna (inplace = True , how = "all" )
@@ -461,10 +473,15 @@ def gen_gtdb_tab(location):
461473 gtdb_tab .to_csv (location + "GTDB-arc-and-bac-metadata.tsv" , index = False , sep = "\t " )
462474
463475 # gtdb changed from using VERSION to using VERSION.txt at some point, so putting in a try/except to shoot for both in case it's doing a version comparison
476+ if args .use_ecogenomics :
477+ gtdb_version_link = "https://data.gtdb.ecogenomic.org/releases/latest/VERSION"
478+ else :
479+ gtdb_version_link = "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/VERSION"
480+
464481 try :
465- gtdb_version_info = urllib .request .urlretrieve ("https://data.gtdb.ecogenomic.org/releases/latest/VERSION .txt" , location + "GTDB-version-info.txt" )
482+ gtdb_version_info = urllib .request .urlretrieve (f"{gtdb_version_link}.txt" , location + "GTDB-version-info.txt" )
466483 except :
467- gtdb_version_info = urllib .request .urlretrieve ("https://data.gtdb.ecogenomic.org/releases/latest/VERSION" , location + "GTDB-version-info.txt" )
484+ gtdb_version_info = urllib .request .urlretrieve (gtdb_version_link , location + "GTDB-version-info.txt" )
468485
469486 return (gtdb_tab )
470487
@@ -488,10 +505,15 @@ def check_stored_gtdb_up_to_date(location):
488505
489506 # getting latest version info from GTDB
490507 # gtdb changed from using VERSION to using VERSION.txt at some point, so putting in a try/except to shoot for both in case it's doing a version comparison
508+ if args .use_ecogenomics :
509+ gtdb_version_link = "https://data.gtdb.ecogenomic.org/releases/latest/VERSION"
510+ else :
511+ gtdb_version_link = "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/VERSION"
512+
491513 try :
492- gtdb_version_info = urllib .request .urlretrieve ("https://data.gtdb.ecogenomic.org/releases/latest/VERSION .txt" , location + "GTDB-latest-version-info.txt" )
514+ gtdb_version_info = urllib .request .urlretrieve (f"{gtdb_version_link}.txt" , location + "GTDB-latest-version-info.txt" )
493515 except :
494- gtdb_version_info = urllib .request .urlretrieve ("https://data.gtdb.ecogenomic.org/releases/latest/VERSION" , location + "GTDB-latest-version-info.txt" )
516+ gtdb_version_info = urllib .request .urlretrieve (gtdb_version_link , location + "GTDB-latest-version-info.txt" )
495517
496518 # comparing vs old
497519 if filecmp .cmp (location + "GTDB-latest-version-info.txt" , location + "GTDB-version-info.txt" ):
0 commit comments