Skip to content

Commit 3d3b82d

Browse files
committed
addressing certificate-failure issue and adding a flag to pull from the other location in gtt-get-accessions-from-GTDB, thanks to #107
1 parent b78d0a2 commit 3d3b82d

1 file changed

Lines changed: 31 additions & 9 deletions

File tree

bin/gtt-get-accessions-from-GTDB

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ import textwrap
1717
import argparse
1818
import shutil
1919
import filecmp
20+
import ssl
21+
###########
22+
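# temp fix for certificate failure (NOTE: this swaps the default HTTPS context for an unverified one, so server certificates are not checked; remove once the upstream certificate problem is resolved): https://github.com/AstrobioMike/GToTree/issues/107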
23+
ssl._create_default_https_context = ssl._create_unverified_context
24+
###########
25+
2026

2127
parser = argparse.ArgumentParser(description="This is a helper program to facilitate using taxonomy \
2228
and genomes from the Genome Taxonomy Database (gtdb.ecogenomic.org) with GToTree.\
@@ -53,7 +59,9 @@ parser.add_argument("--do-not-check-GTDB-version", action="store_true", help="By
5359
parser.add_argument("--store-GTDB-metadata-in-current-working-dir", action="store_true", help="By default, GToTree uses a system-wide variable \
5460
to know where to put and search the GTDB metadata. Provide this flag to \
5561
ignore that and store the master table in the current working directory.")
56-
62+
parser.add_argument("--use-ecogenomics", action="store_true", help="By default, we try to pull the data from 'https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/' \
63+
instead of 'https://data.gtdb.ecogenomic.org/releases/latest/'. Add this flag to try to pull from \
64+
the ecogenomics site (might be much slower depending on where you are).")
5765

5866
if len(sys.argv)==1:
5967
parser.print_help(sys.stderr)
@@ -408,15 +416,19 @@ def gen_gtdb_tab(location):
408416
""" downloads and parses the GTDB info tables """
409417

410418
# getting archaea
411-
# arc_tsv_gz = urllib.request.urlopen("https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tsv.gz")
412-
arc_tsv_gz = urllib.request.urlopen("https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv.gz")
419+
if args.use_ecogenomics:
420+
arc_tsv_gz = urllib.request.urlopen("https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tsv.gz")
421+
else:
422+
arc_tsv_gz = urllib.request.urlopen("https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv.gz")
413423
arc_tab = pd.read_csv(arc_tsv_gz, sep="\t", compression="gzip", on_bad_lines = 'skip', header=0, low_memory=False)
414424
arc_tab.rename(columns={arc_tab.columns[0]:"accession"}, inplace=True)
415425
arc_tab.dropna(inplace=True, how="all")
416426

417427
# getting bacteria
418-
# bac_tsv_gz = urllib.request.urlopen("https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tsv.gz")
419-
bac_tsv_gz = urllib.request.urlopen("https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv.gz")
428+
if args.use_ecogenomics:
429+
bac_tsv_gz = urllib.request.urlopen("https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tsv.gz")
430+
else:
431+
bac_tsv_gz = urllib.request.urlopen("https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv.gz")
420432
bac_tab = pd.read_csv(bac_tsv_gz, sep="\t", compression="gzip", on_bad_lines = 'skip', header=0, low_memory=False)
421433
bac_tab.rename(columns={bac_tab.columns[0]:"accession"}, inplace=True)
422434
bac_tab.dropna(inplace=True, how="all")
@@ -461,10 +473,15 @@ def gen_gtdb_tab(location):
461473
gtdb_tab.to_csv(location + "GTDB-arc-and-bac-metadata.tsv", index=False, sep="\t")
462474

463475
# gtdb changed from using VERSION to using VERSION.txt at some point, so putting in a try/except to shoot for both in case it's doing a version comparison
476+
if args.use_ecogenomics:
477+
gtdb_version_link = "https://data.gtdb.ecogenomic.org/releases/latest/VERSION"
478+
else:
479+
gtdb_version_link = "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/VERSION"
480+
464481
try:
465-
gtdb_version_info = urllib.request.urlretrieve("https://data.gtdb.ecogenomic.org/releases/latest/VERSION.txt", location + "GTDB-version-info.txt")
482+
gtdb_version_info = urllib.request.urlretrieve(f"{gtdb_version_link}.txt", location + "GTDB-version-info.txt")
466483
except:
467-
gtdb_version_info = urllib.request.urlretrieve("https://data.gtdb.ecogenomic.org/releases/latest/VERSION", location + "GTDB-version-info.txt")
484+
gtdb_version_info = urllib.request.urlretrieve(gtdb_version_link, location + "GTDB-version-info.txt")
468485

469486
return(gtdb_tab)
470487

@@ -488,10 +505,15 @@ def check_stored_gtdb_up_to_date(location):
488505

489506
# getting latest version info from GTDB
490507
# gtdb changed from using VERSION to using VERSION.txt at some point, so putting in a try/except to shoot for both in case it's doing a version comparison
508+
if args.ecogenomics:
509+
gtdb_version_link = "https://data.gtdb.ecogenomic.org/releases/latest/VERSION"
510+
else:
511+
gtdb_version_link = "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/VERSION"
512+
491513
try:
492-
gtdb_version_info = urllib.request.urlretrieve("https://data.gtdb.ecogenomic.org/releases/latest/VERSION.txt", location + "GTDB-latest-version-info.txt")
514+
gtdb_version_info = urllib.request.urlretrieve(f"{gtdb_version_link}.txt", location + "GTDB-latest-version-info.txt")
493515
except:
494-
gtdb_version_info = urllib.request.urlretrieve("https://data.gtdb.ecogenomic.org/releases/latest/VERSION", location + "GTDB-latest-version-info.txt")
516+
gtdb_version_info = urllib.request.urlretrieve(gtdb_version_link, location + "GTDB-latest-version-info.txt")
495517

496518
# comparing vs old
497519
if filecmp.cmp(location + "GTDB-latest-version-info.txt", location + "GTDB-version-info.txt"):

0 commit comments

Comments
 (0)