Skip to content

Commit 7d9a12d

Browse files
feat: Switch hmmsearch to using PyHMMER search
PyHMMER has better parrallelism support, directly calling the lower level C bindings for HMMER and rewriting how it parallelizes. This means that when you had cpus=4 arg, it can 1/3 of the walltime with the exact same result.
1 parent efb3cc2 commit 7d9a12d

9 files changed

Lines changed: 73 additions & 13 deletions

File tree

bin/hmm_search.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/usr/bin/env python
2+
import time
3+
import pyhmmer
4+
import click
5+
from pathlib import Path
6+
7+
alphabet = pyhmmer.easel.Alphabet.amino()
8+
9+
@click.command()
10+
@click.option(
11+
"--hmm",
12+
type=str,
13+
help="Path glob to the HMM db.",
14+
)
15+
@click.option(
16+
"--input_file",
17+
type=click.Path(exists=True),
18+
help="Path to the input fasta to search against",
19+
)
20+
@click.option(
21+
"--e_value",
22+
type=float,
23+
help="e value cutoff for filtering"
24+
)
25+
@click.option(
26+
"--output_file",
27+
type=click.Path(),
28+
help="Path to output file",
29+
)
30+
@click.option("--cpus", type=int, help="number of cpu core to run HMMER with")
31+
def main(hmm, input_file, e_value, output_file, cpus):
32+
t1 = time.time()
33+
34+
hmm = Path(hmm)
35+
36+
hmm_paths = hmm.parent.glob(hmm.name)
37+
38+
hmms = []
39+
for path in hmm_paths:
40+
with pyhmmer.plan7.HMMFile(path) as hmm_file:
41+
hmms.extend(hmm_file)
42+
43+
print(hmms)
44+
45+
with open(output_file, "wb") as out_fh:
46+
47+
with pyhmmer.easel.SequenceFile(input_file, digital=True, alphabet=alphabet) as sf:
48+
seqs = pyhmmer.easel.DigitalSequenceBlock(alphabet)
49+
seqs.extend(sf)
50+
first = True
51+
for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=cpus, E=e_value):
52+
hits.write(out_fh, format="domains", header=first)
53+
first=False
54+
#total = sum(len(hits) for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=8, E=1e-15))
55+
print(f"pyhmmer search completed in {time.time() - t1:.3} seconds")
56+
57+
58+
if __name__ == "__main__":
59+
main()

modules/local/annotate/add_sql_descriptions.nf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ process ADD_SQL_DESCRIPTIONS {
44
errorStrategy 'finish'
55

66
conda "${moduleDir}/environment.yml"
7-
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
7+
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"
88

99
input:
1010
tuple val(input_fasta), path(hits_file)

modules/local/annotate/combine_annotations.nf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ process COMBINE_ANNOTATIONS {
44
errorStrategy 'finish'
55

66
conda "${moduleDir}/environment.yml"
7-
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
7+
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"
88

99
input:
1010
path(fastas, stageAs: "annotations/*" )

modules/local/annotate/environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ dependencies:
1010
- scikit-bio=0.7.1
1111
- scipy<2
1212
- click<9.0
13+
- pyhmmer

modules/local/annotate/gene_locs.nf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ process GENE_LOCS {
44
errorStrategy 'finish'
55

66
conda "${moduleDir}/environment.yml"
7-
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
7+
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"
88

99
tag { input_fasta }
1010

modules/local/annotate/hmmsearch.nf

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ process HMM_SEARCH {
44
errorStrategy 'finish'
55

66
conda "${moduleDir}/environment.yml"
7-
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
7+
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"
88

99
tag { input_fasta }
1010

@@ -24,12 +24,12 @@ process HMM_SEARCH {
2424
def ec_flag = ec_from_info ? "--ec_from_info" : ""
2525

2626
"""
27-
hmmsearch \\
28-
-E ${e_value} \\
29-
--domtblout ${input_fasta}_hmmsearch.out \\
30-
--cpu ${task.cpus} \\
31-
${database_loc}/*.hmm \\
32-
${fasta} > /dev/null
27+
hmm_search.py \\
28+
--hmm ${database_loc}/*.hmm \\
29+
--input_file ${fasta} \\
30+
--e_value ${e_value} \\
31+
--output_file ${input_fasta}_hmmsearch.out \\
32+
--cpus ${task.cpus}
3333
3434
hmm_parser.py \\
3535
--hmm_domtbl ${input_fasta}_hmmsearch.out \\

modules/local/annotate/merge_annotations.nf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ process MERGE_ANNOTATIONS {
44
errorStrategy 'finish'
55

66
conda "${moduleDir}/environment.yml"
7-
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
7+
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"
88

99
input:
1010
path( ch_annotations, stageAs: "annotations/*" )

modules/local/annotate/mmseqs_index.nf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ process MMSEQS_INDEX{
44
errorStrategy 'finish'
55

66
conda "${moduleDir}/environment.yml"
7-
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
7+
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"
88

99
tag { input_fasta }
1010

modules/local/annotate/mmseqs_search.nf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ process MMSEQS_SEARCH {
44
errorStrategy 'finish'
55

66
conda "${moduleDir}/environment.yml"
7-
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:d2c88b719ab1322c"
7+
container "community.wave.seqera.io/library/python_pandas_hmmer_mmseqs2_pruned:0a22b52d960467a9"
88

99
tag { input_fasta }
1010

0 commit comments

Comments
 (0)