Skip to content

Commit 90dfef6

Browse files
Merge pull request #497 from WrightonLabCSU/feature/new-database-adds
Feature/new database adds
2 parents cdfe210 + 57efff6 commit 90dfef6

33 files changed

Lines changed: 1767 additions & 102 deletions

CHANGELOG.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,27 @@
22

33
All notable changes to this project will be documented in this file.
44

5+
## 2.0.0-beta28 - 2026-04-15
6+
7+
[cdfe210](https://github.com/WrightonLabCSU/DRAM/commit/cdfe210ca64eb95baf6f1acedb62f91b74630181)...[e07cd74](https://github.com/WrightonLabCSU/DRAM/commit/e07cd74e8d60fca7513f645c04d0956760c74768)
8+
9+
### Features
10+
11+
- Add antiSMASH, CARD, RGI, TCDB ([8d08d1f](https://github.com/WrightonLabCSU/DRAM/commit/8d08d1f9d54fb139eb53587754e569c4317ddc37))
12+
13+
Add antiSMASH Nextflow module; for now it only collects the antiSMASH
14+
raw output while we work on incorporating that output into the
15+
larger pipeline.
16+
Add RGI Nextflow module; like antiSMASH, for now it only
17+
collects raw output while we work on incorporating it into the larger pipeline.
18+
Add CARD database processing with mmseqs.
19+
Add TCDB processing with mmseqs.
20+
21+
- Add DRAM DB HMMs ([e07cd74](https://github.com/WrightonLabCSU/DRAM/commit/e07cd74e8d60fca7513f645c04d0956760c74768))
22+
23+
Add the DRAM team's curated HMM database as a new annotation database option.
24+
The database is still a work in progress and under testing, but it can be found on Globus.
25+
526
## 2.0.0-beta27 - 2026-03-18
627

728
[f03804b](https://github.com/WrightonLabCSU/DRAM/commit/f03804bca43b15e55731316c00b1c34ac328c62c)...[7d9a12d](https://github.com/WrightonLabCSU/DRAM/commit/7d9a12d225c577a6b2fb0c4d7b1ba60a5588e1e8)

bin/combine_annotations.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,8 @@ def combine_annotations(annotations_dir, genes_dir, output, threads):
170170
combined_data[FASTA_COLUMN] = combined_data[FASTA_COLUMN].where(
171171
mask, other=combined_data[FASTA_COLUMN + "2"]
172172
)
173+
# TODO: fix the merge so it doesn't make this column
174+
combined_data = combined_data.drop(columns=FASTA_COLUMN + "2")
173175

174176
combined_data = convert_bit_scores_to_numeric(combined_data)
175177

bin/hmm_parser.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -196,11 +196,6 @@ def main(hmm_domtbl, hmm_info_path, ec_from_info, gene_locs, db_name, output):
196196
hits["perc_cov"] = (hits["model_end"] - hits["model_start"] + 1) / hits[
197197
"query_length"
198198
]
199-
hits[f"{db_name}_id"] = hits["query_name"].str.replace(r".hmm", "", regex=True)
200-
all_hits = get_all_hits(hits, db_name)
201-
all_hits.name = f"{db_name}_ids"
202-
hits = hits.merge(all_hits, how="left", left_on="query_id", right_index=True)
203-
204199
hmm_sheet = False
205200
if hmm_info_path is not None:
206201
hmm_sheet = True
@@ -228,8 +223,11 @@ def main(hmm_domtbl, hmm_info_path, ec_from_info, gene_locs, db_name, output):
228223
pass
229224
elif "definition" in hmm_info.columns:
230225
hmm_info = hmm_info.rename(columns={"definition": "description"})
231-
elif pd.api.types.is_string_dtype(hmm_info.iloc[:, -1]):
232-
hmm_info = hmm_info.rename(columns={hmm_info.columns[-1]: "description"})
226+
elif (
227+
pd.api.types.is_string_dtype(hmm_info.iloc[:, -1])
228+
and hmm_info.columns[-1] not in merge_cols
229+
): # don't need to worry about description in merge cols, cause already checked
230+
hmm_info["deescription"] = hmm_info[hmm_info.columns[-1]].copy()
233231
else:
234232
raise_on_ec = True
235233

@@ -243,10 +241,13 @@ def main(hmm_domtbl, hmm_info_path, ec_from_info, gene_locs, db_name, output):
243241
)
244242

245243
merge_cols = [col for col in merge_cols if col in hmm_info.columns]
246-
244+
print(hmm_info.columns)
245+
print(hmm_info)
247246
hits = hits.merge(
248247
hmm_info[merge_cols], how="left", left_on="query_name", right_index=True
249248
)
249+
print(hits.columns)
250+
print(hits)
250251
hits_sig = sig_scores_row_by_row(hits, db_name=db_name)
251252
drop_cols = [
252253
col
@@ -268,6 +269,15 @@ def main(hmm_domtbl, hmm_info_path, ec_from_info, gene_locs, db_name, output):
268269
# df.to_csv(output, index=False)
269270
return
270271

272+
hits_sig[f"{db_name}_id"] = hits_sig["query_name"].str.replace(
273+
r".hmm", "", regex=True
274+
)
275+
all_hits_sig = get_all_hits(hits_sig, db_name)
276+
all_hits_sig.name = f"{db_name}_ids"
277+
hits_sig = hits_sig.merge(
278+
all_hits_sig, how="left", left_on="query_id", right_index=True
279+
)
280+
271281
# Get the best hit
272282
# hits_sig = hits_sig.sort_values(['full_evalue', "domain_ievalue", "perc_cov"], ascending=[True, True, False]).drop_duplicates(subset=["query_id"])
273283
hits_sig = hits_sig.sort_values(

bin/hmm_search.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,25 +19,35 @@
1919
help="Path to the input fasta to search against",
2020
)
2121
@click.option("--e_value", type=float, help="e value cutoff for filtering")
22+
@click.option("--t_value", type=float, help="bitscore cutoff for filtering")
2223
@click.option(
2324
"--output_file",
2425
type=click.Path(),
2526
help="Path to output file",
2627
)
2728
@click.option("--cpus", type=int, help="number of cpu core to run HMMER with")
28-
def main(hmm, input_file, e_value, output_file, cpus):
29+
def main(hmm, input_file, e_value, t_value, output_file, cpus):
2930
t1 = time.time()
3031

3132
hmm = Path(hmm)
32-
33-
hmm_paths = hmm.parent.glob(hmm.name)
33+
if hmm.is_dir(): # if directory passed, glob all hmms in dir
34+
hmm = hmm / "*.hmm"
35+
if "*" in str(hmm) or "?" in str(hmm): # check if path is glob path
36+
hmm_paths = hmm.parent.glob(hmm.name)
37+
else:
38+
hmm_paths = [hmm]
3439

3540
hmms = []
3641
for path in hmm_paths:
3742
with pyhmmer.plan7.HMMFile(path) as hmm_file:
3843
hmms.extend(hmm_file)
3944

4045
print(hmms)
46+
kw = {}
47+
if t_value:
48+
kw["T"] = t_value
49+
elif e_value:
50+
kw["E"] = e_value
4151

4252
with open(output_file, "wb") as out_fh:
4353
with pyhmmer.easel.SequenceFile(
@@ -46,7 +56,7 @@ def main(hmm, input_file, e_value, output_file, cpus):
4656
seqs = pyhmmer.easel.DigitalSequenceBlock(alphabet)
4757
seqs.extend(sf)
4858
first = True
49-
for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=cpus, E=e_value):
59+
for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=cpus, **kw):
5060
hits.write(out_fh, format="domains", header=first)
5161
first = False
5262
# total = sum(len(hits) for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=8, E=1e-15))

bin/utils/click_utils.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,21 @@
11
#!/usr/bin/env python
2-
def validate_comma_separated(ctx, param, value, split=(",", " ")):
2+
def validate_comma_separated(ctx, param, value, split=(",", " "), converter=None):
    """Click callback that turns a delimited option value into a list.

    Args:
        ctx: Click context (unused; required by the callback signature).
        param: Click parameter (unused; required by the callback signature).
        value: The raw option value. Either a single string, or a list/tuple
            of strings, each of which may itself hold several delimited items.
        split: A single separator string, or an iterable of separator
            strings. The value is split on every occurrence of any of them.
        converter: Optional callable applied to each stripped item
            (for example ``int`` or ``float``).

    Returns:
        A list of stripped (and, if requested, converted) items. An empty or
        missing ``value`` yields ``[]``. Empty items produced by adjacent
        separators (e.g. ``"a, b"`` with the default separators) are kept,
        so a ``converter`` must be able to handle ``""`` if such input is
        expected.
    """
    import re  # local import: the enclosing module has no import block

    if not value:
        return []

    # Normalise ``split`` to a list of non-empty separators. A bare string is
    # one separator; previously it was wrapped in a list and then handed to
    # ``str.split``, which raises TypeError.
    candidates = [split] if isinstance(split, str) else list(split)
    separators = [sep for sep in candidates if sep]

    # Splitting each chunk on its own is equivalent to joining the chunks
    # with a separator and splitting the joined string.
    chunks = list(value) if isinstance(value, (list, tuple)) else [value]

    if separators:
        # One escaped alternation instead of a replace-with-sentinel loop:
        # a separator such as "|" can no longer corrupt an inserted marker.
        pattern = re.compile("|".join(re.escape(sep) for sep in separators))
        items = [piece for chunk in chunks for piece in pattern.split(chunk)]
    else:
        items = chunks

    items = [item.strip() for item in items]
    if converter is not None:
        items = [converter(item) for item in items]
    return items

modules.json

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,25 @@
55
"https://github.com/nf-core/modules.git": {
66
"modules": {
77
"nf-core": {
8+
"antismash/antismash": {
9+
"branch": "master",
10+
"git_sha": "96c57dfd98a0641886a67bd449fe33ee2ec0e374",
11+
"installed_by": ["modules"]
12+
},
13+
"antismash/antismashdownloaddatabases": {
14+
"branch": "master",
15+
"git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46",
16+
"installed_by": ["modules"]
17+
},
818
"multiqc": {
919
"branch": "master",
1020
"git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d",
1121
"installed_by": ["modules"]
22+
},
23+
"rgi/main": {
24+
"branch": "master",
25+
"git_sha": "5e748ff2b0f990949081c9e49792622eb3fe9ee9",
26+
"installed_by": ["modules"]
1227
}
1328
}
1429
},

modules/local/annotate/hmmsearch.nf

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,13 @@ process HMM_SEARCH {
2222
script:
2323
def args = task.ext.args ?: ""
2424
def ec_flag = ec_from_info ? "--ec_from_info" : ""
25+
def cutoff_flag = e_value ? "--e_value ${e_value}" : ""
2526

2627
"""
2728
hmm_search.py \\
28-
--hmm ${database_loc}/*.hmm \\
29+
--hmm ${database_loc} \\
2930
--input_file ${fasta} \\
30-
--e_value ${e_value} \\
31+
${cutoff_flag} \\
3132
--output_file ${input_fasta}_hmmsearch.out \\
3233
--cpus ${task.cpus}
3334

modules/local/annotate/mmseqs_search.nf

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,14 @@ process MMSEQS_SEARCH {
3636
# Perform search
3737
mmseqs search query_database/${input_fasta}.mmsdb ${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/tmp --threads ${task.cpus}
3838
39-
# Filter to only best hit
40-
mmseqs filterdb mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}_tophit.mmsdb --extract-lines 1
41-
4239
# Filter to only hits with minimum bit score
43-
mmseqs filterdb --filter-column 2 --comparison-operator ge --comparison-value ${bit_score_threshold} --threads ${task.cpus} mmseqs_out/${input_fasta}_${db_name}_tophit.mmsdb mmseqs_out/${input_fasta}_${db_name}_tophit_minbitscore${bit_score_threshold}.mmsdb
40+
mmseqs filterdb --filter-column 2 --comparison-operator ge --comparison-value ${bit_score_threshold} --threads ${task.cpus} mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}.mmsdb
41+
42+
# Filter to only best hit
43+
mmseqs filterdb mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}.mmsdb --extract-lines 1
4444
4545
# Convert results to BLAST outformat 6
46-
mmseqs convertalis query_database/${input_fasta}.mmsdb ${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}_tophit_minbitscore${bit_score_threshold}.mmsdb mmseqs_out/${input_fasta}___mmseqs_${db_name}.tsv --threads ${task.cpus}
46+
mmseqs convertalis query_database/${input_fasta}.mmsdb ${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/${input_fasta}___mmseqs_${db_name}.tsv --threads ${task.cpus}
4747
4848
# if statement for kegg rbh goes here
4949
elif [ "${db_name}" == "pfam" ]; then
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
3+
channels:
4+
- conda-forge
5+
- bioconda
6+
dependencies:
7+
- "bioconda::antismash=8.0.1"
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
process ANTISMASH_ANTISMASH {
2+
tag "${meta.id}"
3+
label 'process_medium'
4+
5+
conda "${moduleDir}/environment.yml"
6+
container "nf-core/antismash:8.0.1--pyhdfd78af_0"
7+
8+
input:
9+
tuple val(meta), path(sequence_input)
10+
path databases
11+
path gff
12+
13+
output:
14+
tuple val(meta), path("${prefix}/{css,images,js}") , emit: html_accessory_files
15+
tuple val(meta), path("${prefix}/*.gbk") , emit: gbk_input
16+
tuple val(meta), path("${prefix}/*.json") , emit: json_results
17+
tuple val(meta), path("${prefix}/*.log") , emit: log
18+
tuple val(meta), path("${prefix}/*.zip") , emit: zip
19+
tuple val(meta), path("${prefix}/index.html") , emit: html
20+
tuple val(meta), path("${prefix}/regions.js") , emit: json_sideloading
21+
tuple val(meta), path("${prefix}/clusterblast/*_c*.txt") , emit: clusterblast_file , optional: true
22+
tuple val(meta), path("${prefix}/knownclusterblast/region*/ctg*.html"), emit: knownclusterblast_html , optional: true
23+
tuple val(meta), path("${prefix}/knownclusterblast/") , emit: knownclusterblast_dir , optional: true
24+
tuple val(meta), path("${prefix}/knownclusterblast/*_c*.txt") , emit: knownclusterblast_txt , optional: true
25+
tuple val(meta), path("${prefix}/svg/clusterblast*.svg") , emit: svg_files_clusterblast , optional: true
26+
tuple val(meta), path("${prefix}/svg/knownclusterblast*.svg") , emit: svg_files_knownclusterblast, optional: true
27+
tuple val(meta), path("${prefix}/*region*.gbk") , emit: gbk_results , optional: true
28+
tuple val(meta), path("${prefix}/clusterblastoutput.txt") , emit: clusterblastoutput , optional: true
29+
tuple val(meta), path("${prefix}/knownclusterblastoutput.txt") , emit: knownclusterblastoutput , optional: true
30+
path "versions.yml" , emit: versions
31+
32+
when:
33+
task.ext.when == null || task.ext.when
34+
35+
script:
36+
def args = task.ext.args ?: ''
37+
prefix = task.ext.prefix ?: "${meta.id}"
38+
gff_flag = gff ? "--genefinding-gff3 ${gff}" : ""
39+
40+
"""
41+
## We specifically do not include on-the-fly annotations (--genefinding-tool none) as
42+
## this should be run as a separate module for versioning purposes
43+
44+
antismash \\
45+
${args} \\
46+
${gff_flag} \\
47+
-c ${task.cpus} \\
48+
--output-dir ${prefix} \\
49+
--output-basename ${prefix} \\
50+
--genefinding-tool none \\
51+
--logfile ${prefix}/${prefix}.log \\
52+
--databases ${databases} \\
53+
${sequence_input}
54+
55+
cat <<-END_VERSIONS > versions.yml
56+
"${task.process}":
57+
antismash: \$(echo \$(antismash --version) | sed 's/antiSMASH //;s/-.*//g')
58+
END_VERSIONS
59+
"""
60+
61+
stub:
62+
prefix = task.ext.prefix ?: "${meta.id}"
63+
"""
64+
mkdir -p ${prefix}/css
65+
mkdir ${prefix}/images
66+
mkdir ${prefix}/js
67+
touch ${prefix}/NZ_CP069563.1.region001.gbk
68+
touch ${prefix}/NZ_CP069563.1.region002.gbk
69+
touch ${prefix}/css/bacteria.css
70+
touch ${prefix}/genome.gbk
71+
touch ${prefix}/genome.json
72+
touch ${prefix}/genome.zip
73+
touch ${prefix}/images/about.svg
74+
touch ${prefix}/index.html
75+
touch ${prefix}/js/antismash.js
76+
touch ${prefix}/js/jquery.js
77+
touch ${prefix}/regions.js
78+
touch ${prefix}/test.log
79+
80+
cat <<-END_VERSIONS > versions.yml
81+
"${task.process}":
82+
antismash: \$(echo \$(antismash --version) | sed 's/antiSMASH //;s/-.*//g')
83+
END_VERSIONS
84+
"""
85+
}

0 commit comments

Comments
 (0)