WrightonLabCSU
diff --git a/‎CHANGELOG.md‎
Lines changed: 21 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎bin/combine_annotations.py‎
Lines changed: 2 additions & 0 deletions b/‎bin/combine_annotations.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎bin/hmm_parser.py‎
Lines changed: 18 additions & 8 deletions b/‎bin/hmm_parser.py‎
Lines changed: 18 additions & 8 deletions
diff --git a/‎bin/hmm_search.py‎
Lines changed: 14 additions & 4 deletions b/‎bin/hmm_search.py‎
Lines changed: 14 additions & 4 deletions
diff --git a/‎bin/utils/click_utils.py‎
Lines changed: 11 additions & 3 deletions b/‎bin/utils/click_utils.py‎
Lines changed: 11 additions & 3 deletions
diff --git a/‎modules.json‎
Lines changed: 15 additions & 0 deletions b/‎modules.json‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎modules/local/annotate/hmmsearch.nf‎
Lines changed: 3 additions & 2 deletions b/‎modules/local/annotate/hmmsearch.nf‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎modules/local/annotate/mmseqs_search.nf‎
Lines changed: 5 additions & 5 deletions b/‎modules/local/annotate/mmseqs_search.nf‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎modules/nf-core/antismash/antismash/environment.yml‎
Lines changed: 7 additions & 0 deletions b/‎modules/nf-core/antismash/antismash/environment.yml‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎modules/nf-core/antismash/antismash/main.nf‎
Lines changed: 85 additions & 0 deletions b/‎modules/nf-core/antismash/antismash/main.nf‎
Lines changed: 85 additions & 0 deletions
@@ -2,6 +2,27 @@
 
 All notable changes to this project will be documented in this file.
 
+## 2.0.0-beta28 - 2026-04-15
+
+[cdfe210](https://github.com/WrightonLabCSU/DRAM/commit/cdfe210ca64eb95baf6f1acedb62f91b74630181)...[e07cd74](https://github.com/WrightonLabCSU/DRAM/commit/e07cd74e8d60fca7513f645c04d0956760c74768)
+
+### Features
+
+- Add antiSMASH, CARD, RGI, TCDB ([8d08d1f](https://github.com/WrightonLabCSU/DRAM/commit/8d08d1f9d54fb139eb53587754e569c4317ddc37))
+
+  Add antiSMASH Nextflow module. For now it only collects the raw
+  antiSMASH output while we work on incorporating that output into
+  the larger pipeline.
+  Add RGI Nextflow module. Like antiSMASH, for now it only collects
+  raw output while we work on incorporating it into the larger pipeline.
+  Add CARD db processing with mmseqs.
+  Add TCDB processing with mmseqs.
+
+- Add DRAM DB HMMs ([e07cd74](https://github.com/WrightonLabCSU/DRAM/commit/e07cd74e8d60fca7513f645c04d0956760c74768))
+
+  Add the DRAM team's curated HMM database as a new annotation db option.
+  The database is still a work in progress and under testing, but it is available on Globus.
+
 ## 2.0.0-beta27 - 2026-03-18
 
 [f03804b](https://github.com/WrightonLabCSU/DRAM/commit/f03804bca43b15e55731316c00b1c34ac328c62c)...[7d9a12d](https://github.com/WrightonLabCSU/DRAM/commit/7d9a12d225c577a6b2fb0c4d7b1ba60a5588e1e8)
 
@@ -170,6 +170,8 @@ def combine_annotations(annotations_dir, genes_dir, output, threads):
         combined_data[FASTA_COLUMN] = combined_data[FASTA_COLUMN].where(
             mask, other=combined_data[FASTA_COLUMN + "2"]
         )
+        # TODO: fix the merge so it doesn't make this column
+        combined_data = combined_data.drop(columns=FASTA_COLUMN + "2")
 
     combined_data = convert_bit_scores_to_numeric(combined_data)
 
 
@@ -196,11 +196,6 @@ def main(hmm_domtbl, hmm_info_path, ec_from_info, gene_locs, db_name, output):
     hits["perc_cov"] = (hits["model_end"] - hits["model_start"] + 1) / hits[
         "query_length"
     ]
-    hits[f"{db_name}_id"] = hits["query_name"].str.replace(r".hmm", "", regex=True)
-    all_hits = get_all_hits(hits, db_name)
-    all_hits.name = f"{db_name}_ids"
-    hits = hits.merge(all_hits, how="left", left_on="query_id", right_index=True)
-
     hmm_sheet = False
     if hmm_info_path is not None:
         hmm_sheet = True
@@ -228,8 +223,11 @@ def main(hmm_domtbl, hmm_info_path, ec_from_info, gene_locs, db_name, output):
             pass
         elif "definition" in hmm_info.columns:
             hmm_info = hmm_info.rename(columns={"definition": "description"})
-        elif pd.api.types.is_string_dtype(hmm_info.iloc[:, -1]):
-            hmm_info = hmm_info.rename(columns={hmm_info.columns[-1]: "description"})
+        elif (
+            pd.api.types.is_string_dtype(hmm_info.iloc[:, -1])
+            and hmm_info.columns[-1] not in merge_cols
+        ):  # don't need to worry about description in merge cols, cause already checked
+            hmm_info["deescription"] = hmm_info[hmm_info.columns[-1]].copy()
         else:
             raise_on_ec = True
 
@@ -243,10 +241,13 @@ def main(hmm_domtbl, hmm_info_path, ec_from_info, gene_locs, db_name, output):
             )
 
         merge_cols = [col for col in merge_cols if col in hmm_info.columns]
-
+        print(hmm_info.columns)
+        print(hmm_info)
         hits = hits.merge(
             hmm_info[merge_cols], how="left", left_on="query_name", right_index=True
         )
+        print(hits.columns)
+        print(hits)
         hits_sig = sig_scores_row_by_row(hits, db_name=db_name)
         drop_cols = [
             col
@@ -268,6 +269,15 @@ def main(hmm_domtbl, hmm_info_path, ec_from_info, gene_locs, db_name, output):
         # df.to_csv(output, index=False)
         return
 
+    hits_sig[f"{db_name}_id"] = hits_sig["query_name"].str.replace(
+        r".hmm", "", regex=True
+    )
+    all_hits_sig = get_all_hits(hits_sig, db_name)
+    all_hits_sig.name = f"{db_name}_ids"
+    hits_sig = hits_sig.merge(
+        all_hits_sig, how="left", left_on="query_id", right_index=True
+    )
+
     # Get the best hit
     # hits_sig = hits_sig.sort_values(['full_evalue', "domain_ievalue", "perc_cov"], ascending=[True, True, False]).drop_duplicates(subset=["query_id"])
     hits_sig = hits_sig.sort_values(
 
@@ -19,25 +19,35 @@
     help="Path to the input fasta to search against",
 )
 @click.option("--e_value", type=float, help="e value cutoff for filtering")
+@click.option("--t_value", type=float, help="bitscore cutoff for filtering")
 @click.option(
     "--output_file",
     type=click.Path(),
     help="Path to output file",
 )
 @click.option("--cpus", type=int, help="number of cpu core to run HMMER with")
-def main(hmm, input_file, e_value, output_file, cpus):
+def main(hmm, input_file, e_value, t_value, output_file, cpus):
     t1 = time.time()
 
     hmm = Path(hmm)
-
-    hmm_paths = hmm.parent.glob(hmm.name)
+    if hmm.is_dir():  # if directory passed, glob all hmms in dir
+        hmm = hmm / "*.hmm"
+    if "*" in str(hmm) or "?" in str(hmm):  # check if path is glob path
+        hmm_paths = hmm.parent.glob(hmm.name)
+    else:
+        hmm_paths = [hmm]
 
     hmms = []
     for path in hmm_paths:
         with pyhmmer.plan7.HMMFile(path) as hmm_file:
             hmms.extend(hmm_file)
 
     print(hmms)
+    kw = {}
+    if t_value:
+        kw["T"] = t_value
+    elif e_value:
+        kw["E"] = e_value
 
     with open(output_file, "wb") as out_fh:
         with pyhmmer.easel.SequenceFile(
@@ -46,7 +56,7 @@ def main(hmm, input_file, e_value, output_file, cpus):
             seqs = pyhmmer.easel.DigitalSequenceBlock(alphabet)
             seqs.extend(sf)
             first = True
-            for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=cpus, E=e_value):
+            for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=cpus, **kw):
                 hits.write(out_fh, format="domains", header=first)
                 first = False
             # total = sum(len(hits) for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=8, E=1e-15))
 
@@ -1,13 +1,21 @@
 #!/usr/bin/env python
def validate_comma_separated(ctx, param, value, split=(",", " "), converter=None):
    """Split a delimited option value into a list of stripped tokens.

    The ``(ctx, param, value)`` leading signature matches what a click
    option callback receives; ``ctx`` and ``param`` are not used here.

    Args:
        ctx: Unused.
        param: Unused.
        value: The raw value. A string is split directly; a list/tuple of
            strings is first joined with the first delimiter and then split,
            so every element may itself contain delimiters. Any falsy value
            (``None``, ``""``, ``()``) yields ``[]``.
        split: Either a single delimiter string or a list/tuple of delimiter
            strings. All delimiters are treated as literal text.
        converter: Optional callable applied to each stripped token
            (e.g. ``int`` or ``float``).

    Returns:
        A list of tokens, whitespace-stripped and passed through
        ``converter`` when one is given. Adjacent delimiters produce empty
        tokens (``"a, b"`` -> ``["a", "", "b"]``), as before.
    """
    import re  # function-scope import: the module has no import block of its own

    if not value:
        return []

    # Normalise to a list so a lone delimiter string and a collection of
    # delimiters share one code path. Previously the string case wrapped the
    # delimiter in a list and then handed that list to ``str.split``, which
    # raises TypeError.
    delimiters = [split] if isinstance(split, str) else list(split)

    if isinstance(value, (list, tuple)):
        # Join with a known delimiter so the pieces are re-split uniformly.
        value = delimiters[0].join(value)

    # One regex split over all (escaped) delimiters. This avoids the
    # placeholder-substitution approach, where a later delimiter such as "|"
    # could match inside placeholders inserted for an earlier delimiter.
    pattern = "|".join(re.escape(delim) for delim in delimiters)
    pieces = re.split(pattern, value) if pattern else [value]

    tokens = [piece.strip() for piece in pieces]
    if converter:
        tokens = [converter(token) for token in tokens]
    return tokens
@@ -5,10 +5,25 @@
         "https://github.com/nf-core/modules.git": {
             "modules": {
                 "nf-core": {
+                    "antismash/antismash": {
+                        "branch": "master",
+                        "git_sha": "96c57dfd98a0641886a67bd449fe33ee2ec0e374",
+                        "installed_by": ["modules"]
+                    },
+                    "antismash/antismashdownloaddatabases": {
+                        "branch": "master",
+                        "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46",
+                        "installed_by": ["modules"]
+                    },
                     "multiqc": {
                         "branch": "master",
                         "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d",
                         "installed_by": ["modules"]
+                    },
+                    "rgi/main": {
+                        "branch": "master",
+                        "git_sha": "5e748ff2b0f990949081c9e49792622eb3fe9ee9",
+                        "installed_by": ["modules"]
                     }
                 }
             },
 
@@ -22,12 +22,13 @@ process HMM_SEARCH {
     script:
     def args = task.ext.args ?: ""
     def ec_flag = ec_from_info ? "--ec_from_info" : ""
+    def cutoff_flag = e_value ? "--e_value ${e_value}" : ""
 
     """
     hmm_search.py \\
-        --hmm  ${database_loc}/*.hmm \\
+        --hmm  ${database_loc} \\
         --input_file ${fasta} \\
-        --e_value ${e_value} \\
+        ${cutoff_flag} \\
         --output_file ${input_fasta}_hmmsearch.out \\
         --cpus ${task.cpus}
 
 
@@ -36,14 +36,14 @@ process MMSEQS_SEARCH {
         # Perform search
         mmseqs search query_database/${input_fasta}.mmsdb ${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/tmp --threads ${task.cpus}
 
-        # Filter to only best hit
-        mmseqs filterdb mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}_tophit.mmsdb --extract-lines 1
-
         # Filter to only hits with minimum bit score
-        mmseqs filterdb --filter-column 2 --comparison-operator ge --comparison-value ${bit_score_threshold} --threads ${task.cpus} mmseqs_out/${input_fasta}_${db_name}_tophit.mmsdb mmseqs_out/${input_fasta}_${db_name}_tophit_minbitscore${bit_score_threshold}.mmsdb
+        mmseqs filterdb --filter-column 2 --comparison-operator ge --comparison-value ${bit_score_threshold} --threads ${task.cpus} mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}.mmsdb
+
+        # Filter to only best hit
+        mmseqs filterdb mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}.mmsdb --extract-lines 1
 
         # Convert results to BLAST outformat 6
-        mmseqs convertalis query_database/${input_fasta}.mmsdb ${db_name}.mmsdb  mmseqs_out/${input_fasta}_${db_name}_tophit_minbitscore${bit_score_threshold}.mmsdb mmseqs_out/${input_fasta}___mmseqs_${db_name}.tsv --threads ${task.cpus}
+        mmseqs convertalis query_database/${input_fasta}.mmsdb ${db_name}.mmsdb  mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/${input_fasta}___mmseqs_${db_name}.tsv --threads ${task.cpus}
 
         # if statement for kegg rbh goes here
     elif [ "${db_name}" == "pfam" ]; then
 
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - "bioconda::antismash=8.0.1"
@@ -0,0 +1,85 @@
+process ANTISMASH_ANTISMASH {
+    tag "${meta.id}"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "nf-core/antismash:8.0.1--pyhdfd78af_0"
+
+    input:
+    tuple val(meta), path(sequence_input)
+    path databases
+    path gff
+
+    output:
+    tuple val(meta), path("${prefix}/{css,images,js}")                    , emit: html_accessory_files
+    tuple val(meta), path("${prefix}/*.gbk")                              , emit: gbk_input
+    tuple val(meta), path("${prefix}/*.json")                             , emit: json_results
+    tuple val(meta), path("${prefix}/*.log")                              , emit: log
+    tuple val(meta), path("${prefix}/*.zip")                              , emit: zip
+    tuple val(meta), path("${prefix}/index.html")                         , emit: html
+    tuple val(meta), path("${prefix}/regions.js")                         , emit: json_sideloading
+    tuple val(meta), path("${prefix}/clusterblast/*_c*.txt")              , emit: clusterblast_file          , optional: true
+    tuple val(meta), path("${prefix}/knownclusterblast/region*/ctg*.html"), emit: knownclusterblast_html     , optional: true
+    tuple val(meta), path("${prefix}/knownclusterblast/")                 , emit: knownclusterblast_dir      , optional: true
+    tuple val(meta), path("${prefix}/knownclusterblast/*_c*.txt")         , emit: knownclusterblast_txt      , optional: true
+    tuple val(meta), path("${prefix}/svg/clusterblast*.svg")              , emit: svg_files_clusterblast     , optional: true
+    tuple val(meta), path("${prefix}/svg/knownclusterblast*.svg")         , emit: svg_files_knownclusterblast, optional: true
+    tuple val(meta), path("${prefix}/*region*.gbk")                       , emit: gbk_results                , optional: true
+    tuple val(meta), path("${prefix}/clusterblastoutput.txt")             , emit: clusterblastoutput         , optional: true
+    tuple val(meta), path("${prefix}/knownclusterblastoutput.txt")        , emit: knownclusterblastoutput    , optional: true
+    path "versions.yml"                                                   , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args   ?: ''
+    prefix   = task.ext.prefix ?: "${meta.id}"
+    gff_flag = gff ? "--genefinding-gff3 ${gff}" : ""
+
+    """
+    ## We specifically do not include on-the-fly annotations (--genefinding-tool none) as
+    ## this should be run as a separate module for versioning purposes
+
+    antismash \\
+        ${args} \\
+        ${gff_flag} \\
+        -c ${task.cpus} \\
+        --output-dir ${prefix} \\
+        --output-basename ${prefix} \\
+        --genefinding-tool none \\
+        --logfile ${prefix}/${prefix}.log \\
+        --databases ${databases} \\
+        ${sequence_input}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        antismash: \$(echo \$(antismash --version) | sed 's/antiSMASH //;s/-.*//g')
+    END_VERSIONS
+    """
+
+    stub:
+    prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    mkdir -p ${prefix}/css
+    mkdir ${prefix}/images
+    mkdir ${prefix}/js
+    touch ${prefix}/NZ_CP069563.1.region001.gbk
+    touch ${prefix}/NZ_CP069563.1.region002.gbk
+    touch ${prefix}/css/bacteria.css
+    touch ${prefix}/genome.gbk
+    touch ${prefix}/genome.json
+    touch ${prefix}/genome.zip
+    touch ${prefix}/images/about.svg
+    touch ${prefix}/index.html
+    touch ${prefix}/js/antismash.js
+    touch ${prefix}/js/jquery.js
+    touch ${prefix}/regions.js
+    touch ${prefix}/test.log
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        antismash: \$(echo \$(antismash --version) | sed 's/antiSMASH //;s/-.*//g')
+    END_VERSIONS
+    """
+}
Original file line number	Diff line number	Diff line change
`@@ -170,6 +170,8 @@ def combine_annotations(annotations_dir, genes_dir, output, threads):`
`170`	`170`	`combined_data[FASTA_COLUMN] = combined_data[FASTA_COLUMN].where(`
`171`	`171`	`mask, other=combined_data[FASTA_COLUMN + "2"]`
`172`	`172`	`)`
	`173`	`+ # TODO: fix the merge so it doesn't make this column`
	`174`	`+ combined_data = combined_data.drop(columns=FASTA_COLUMN + "2")`
`173`	`175`
`174`	`176`	`combined_data = convert_bit_scores_to_numeric(combined_data)`
`175`	`177`