Skip to content

Commit 3659fda

Browse files
Merge pull request #480 from WrightonLabCSU/feature/dram-ag-distill
Feature/dram ag distill with new rule parser for distill, traits, and product
2 parents 348e276 + e523dd3 commit 3659fda

39 files changed

Lines changed: 1716 additions & 3899 deletions

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[submodule "bin/rule_parser"]
2+
path = bin/rule_parser
3+
url = https://github.com/WrightonLabCSU/Rule-Parser

CHANGELOG.md

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,43 @@
22

33
All notable changes to this project will be documented in this file.
44

5+
## 2.0.0-beta23 - 2026-02-01
6+
7+
[348e276](https://github.com/WrightonLabCSU/DRAM/commit/348e2764de7b666c0f6dcc1d82deea67d96e439b)...[852e0ae](https://github.com/WrightonLabCSU/DRAM/commit/852e0ae18ae5976e8594e8f68ed1938f7eafce9d)
8+
9+
### Features
10+
11+
- Add new DRAM rule parser submodule for traits and distill ([5e9a088](https://github.com/WrightonLabCSU/DRAM/commit/5e9a08872b590c3ac372047db46ddbaa946b8814))
12+
13+
New Python Lark-based rule parser that defines rule grammar
14+
for traits grammar that can be reused for distill and product.
15+
This rule parser is more accurate and less error-prone than
16+
the older, completely custom-coded traits parser. This fixes
17+
a number of bugs where things were being double counted and not
18+
parsed correctly. This also allows distill, traits, and product
19+
eventually to all use the same rule parsing code. Rule parsing
20+
documentation can be found in docs/rules_parser.md or
21+
https://dramit.readthedocs.io/en/latest/rules_parser.html
22+
23+
Rule parser is implemented with polars instead of Pandas for a
24+
few reasons. It is a bit faster, and it allows lazy query planning.
25+
The annotation df is currently eager rather than lazy, but there are plans
26+
to allow it to be lazy. Lazy DataFrames can be more memory efficient
27+
by only loading the data/columns needed, and doing query optimization
28+
to speed up and require less memory for intermediate steps.
29+
- Update summarize eco Ag sheet to newly developed Ag sheet ([4937f1c](https://github.com/WrightonLabCSU/DRAM/commit/4937f1ce846b3910e5fe18264f9daca8221dd207))
30+
31+
32+
- Add carbon rules to traits, save traits as excel sheet ([02de665](https://github.com/WrightonLabCSU/DRAM/commit/02de6657261fb5517b3df20985285a6b2b392fa7))
33+
34+
35+
36+
### Package
37+
38+
- Package cleanup. Removing old files and cleanups ([c6bae64](https://github.com/WrightonLabCSU/DRAM/commit/c6bae64a787623721ff402a38afd94bab63eb12d))
39+
40+
41+
542
## 2.0.0-beta22 - 2025-12-17
643

744
[64da24e](https://github.com/WrightonLabCSU/DRAM/commit/64da24e51367feaffe2f643dfbb0aa602e28c5c5)...[0933424](https://github.com/WrightonLabCSU/DRAM/commit/093342450e6e49c1a706cdf2c181bd931bab7a6d)

bin/adjectives.py

Lines changed: 117 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,10 @@
33
import os
44
import ast
55
import click
6+
import polars as pl
67

7-
import pandas as pd
8-
9-
from rule_adjectives.rule_graph import RuleParser, get_positive_genes
10-
from rule_adjectives.annotations import Annotations
11-
from utils.click_utils import validate_comma_separated
12-
8+
from rule_parser.src.rules import evaluate_rules_on_anno
9+
from utils.excel import write_summarized_genomes_to_xlsx
1310

1411
class PythonLiteralOption(click.Option):
1512

@@ -20,7 +17,7 @@ def type_cast_value(self, ctx, value):
2017
raise click.BadParameter(value)
2118

2219

23-
def get_package_path(local_path):
20+
def get_assets_path(local_path):
2421
"""
2522
Locate the package data or non python files
2623
@@ -29,117 +26,135 @@ def get_package_path(local_path):
2926
"""
3027
abs_snake_path = os.path.join(os.path.dirname(
3128
os.path.abspath(__file__)),
32-
"rule_adjectives",
29+
"assets",
3330
local_path)
3431
return abs_snake_path
3532

3633

37-
def list_adjectives(ctx, param, value):
38-
if not value or ctx.resilient_parsing:
39-
return
40-
rules = RuleParser(get_package_path('rules.tsv'), verbose=False)
41-
print("In the current rules file, these adjectives are available:")
42-
for i in rules.data.index[~rules.data['name'].isna()].unique():
43-
print(i)
34+
# def list_adjectives(ctx, param, value):
35+
# if not value or ctx.resilient_parsing:
36+
# return
37+
# rules = RuleParser(get_assets_path('traits_rules.tsv'), verbose=False)
38+
# print("In the current rules file, these adjectives are available:")
39+
# for i in rules.data.index[~rules.data['name'].isna()].unique():
40+
# print(i)
4441

4542

46-
def list_adjective_name(ctx, param, value):
47-
if not value or ctx.resilient_parsing:
48-
return
49-
rules = RuleParser(get_package_path('rules.tsv'), verbose=False)
50-
print("In the current rules file, these adjectives are available:")
51-
for i in rules.data['name'].unique():
52-
print(i)
43+
# def list_adjective_name(ctx, param, value):
44+
# if not value or ctx.resilient_parsing:
45+
# return
46+
# rules = RuleParser(get_assets_path('traits_rules.tsv'), verbose=False)
47+
# print("In the current rules file, these adjectives are available:")
48+
# for i in rules.data['name'].unique():
49+
# print(i)
5350

54-
def show_rules_path(ctx, param, value):
55-
if not value or ctx.resilient_parsing:
56-
return
57-
print(get_package_path('rules.tsv'))
51+
# def show_rules_path(ctx, param, value):
52+
# if not value or ctx.resilient_parsing:
53+
# return
54+
# print(get_assets_path('traits_traits_rules.tsv'))
5855

5956

6057
@click.command()
6158
@click.option('--annotations', type=click.Path(exists=True), required=True, help="One of only 2 required files. Path to a DRAM annotations file.")
62-
@click.option('-o', '--output', type=click.Path(), default='adjectives.tsv', help="Path for the output table. A true false table created by this script.")
63-
@click.option('-a', '--adjectives_list', default="", callback=validate_comma_separated, help="A comma seperated list of adjectives ('adj1,adj2,adj3'), by name, to evaluate. This limits the number of adjectives that are evaluated and is faster.")
64-
@click.option('-p', '--plot_adjectives', multiple=True, default=[], help="A list of adjectives, by name, to plot. This limits the number of adjectives that are plotted and is probably needed for speed.")
65-
@click.option('-g', '--plot_genomes', multiple=True,
66-
default=[], )
67-
@click.option('--plot_path', type=click.Path(exists=False),
68-
default=None,
69-
help='will become a folder of output plots, no path no plots.')
70-
@click.option('--strainer_tsv', type=click.Path(exists=False), default=None, help='The path for a tsv that will pass to strainer to filter genes. The only option at this time is pgtb for positive genes that are on true bugs.')
71-
@click.option('--strainer_type', type=click.Path(exists=False), default=None, help='The type of process that should make the strainer file.')
72-
@click.option('--debug_ids_by_fasta_to_tsv', type=click.Path(exists=False), default=None,
73-
help='This is a tool to debug the list of IDs found by DRAM it is mostly for experts')
59+
@click.option('-o', '--output', type=click.Path(), default='traits.xlsx', help="Path for the output table. A true false table created by this script.")
7460
@click.option('--rules_tsv', type=click.Path(exists=True),
75-
default=get_package_path('rules.tsv'),
61+
default=get_assets_path('traits_rules.tsv'),
7662
help="This is an optional path to a rules file with strict formatting. It will over write the original rules file that is stored with the script.")
77-
@click.option('--show_rules_path', is_flag=True, callback=show_rules_path,
78-
expose_value=False, is_eager=True,
79-
help="Show the path to the default rules path.")
80-
@click.option('--list_name', is_flag=True, callback=list_adjective_name,
81-
expose_value=False, is_eager=True,
82-
help="List the names for all adjectives.tsv that are"
83-
" available, you can pass these names to limit the"
84-
" adjectives that are evaluated")
85-
@click.option('--list_id', is_flag=True, callback=list_adjectives,
86-
expose_value=False, is_eager=True,
87-
help="List the names for all adjectives.tsv that are"
88-
" available, you can pass these names to limit the"
89-
" adjectives that are evaluated")
63+
# @click.option('-a', '--adjectives_list', default="", callback=validate_comma_separated, help="A comma seperated list of adjectives ('adj1,adj2,adj3'), by name, to evaluate. This limits the number of adjectives that are evaluated and is faster.")
64+
# @click.option('-p', '--plot_adjectives', multiple=True, default=[], help="A list of adjectives, by name, to plot. This limits the number of adjectives that are plotted and is probably needed for speed.")
65+
# @click.option('-g', '--plot_genomes', multiple=True,
66+
# default=[], )
67+
# @click.option('--plot_path', type=click.Path(exists=False),
68+
# default=None,
69+
# help='will become a folder of output plots, no path no plots.')
70+
# @click.option('--strainer_tsv', type=click.Path(exists=False), default=None, help='The path for a tsv that will pass to strainer to filter genes. The only option at this time is pgtb for positive genes that are on true bugs.')
71+
# @click.option('--strainer_type', type=click.Path(exists=False), default=None, help='The type of process that should make the strainer file.')
72+
# @click.option('--debug_ids_by_fasta_to_tsv', type=click.Path(exists=False), default=None,
73+
# help='This is a tool to debug the list of IDs found by DRAM it is mostly for experts')
74+
# @click.option('--show_rules_path', is_flag=True, callback=show_rules_path,
75+
# expose_value=False, is_eager=True,
76+
# help="Show the path to the default rules path.")
77+
# @click.option('--list_name', is_flag=True, callback=list_adjective_name,
78+
# expose_value=False, is_eager=True,
79+
# help="List the names for all adjectives.tsv that are"
80+
# " available, you can pass these names to limit the"
81+
# " adjectives that are evaluated")
82+
# @click.option('--list_id', is_flag=True, callback=list_adjectives,
83+
# expose_value=False, is_eager=True,
84+
# help="List the names for all adjectives.tsv that are"
85+
# " available, you can pass these names to limit the"
86+
# " adjectives that are evaluated")
9087
# @click.argument('-p', type=click.Path(exists=True))
9188
def evaluate(annotations:str, output:str,
92-
rules_tsv:str=get_package_path('rules.tsv'),
93-
adjectives_list:list=None, plot_adjectives:list=None,
94-
plot_genomes:list=None,plot_path:str=None,
95-
debug_ids_by_fasta_to_tsv:str=None,
96-
strainer_tsv:str=None, strainer_type='pgtb'):
89+
rules_tsv:str=get_assets_path('traits_rules.tsv'),
90+
# adjectives_list:list=None, plot_adjectives:list=None,
91+
# plot_genomes:list=None,plot_path:str=None,
92+
# debug_ids_by_fasta_to_tsv:str=None,
93+
# strainer_tsv:str=None, strainer_type='pgtb'
94+
):
9795
"""Using a DRAM annotations file make a table of adjectives."""
98-
rules = RuleParser(rules_tsv, verbose=False, adjectives=adjectives_list)
99-
annotations = Annotations(annotations)
100-
adjectives = rules.check_genomes(annotations)
101-
if debug_ids_by_fasta_to_tsv is not None:
102-
annotations.ids_by_fasta.to_csv(debug_ids_by_fasta_to_tsv, sep='\t')
103-
exit()
104-
adjectives.to_csv(output, sep='\t')
105-
# annotations.ids_by_fasta.iloc[1]['annotations']
106-
if plot_path is not None:
107-
rules.plot_cause(plot_path, adjectives=plot_adjectives,
108-
genomes=plot_genomes, show_steps=False
109-
)
110-
if strainer_tsv is not None:
111-
strainer_data = get_positive_genes(rules, annotations, adjectives)
112-
strainer_data.to_csv(strainer_tsv, sep='\t')
113-
114-
115-
@click.command()
116-
@click.argument('plot_path', type=click.Path(exists=False),
117-
default=None)#, help='will become a folder of output plots, no path no plots.')
118-
@click.option('-a', '--adjectives', multiple=True, default=[], help="A list of adjectives, by name, to evaluate. This limits the number of adjectives that are evaluated and is faster.")
119-
@click.option('--rules_tsv', type=click.Path(exists=True),
120-
default=get_package_path('rules.tsv'),
121-
help='The path that will become a folder of output plots, no path no plots.') # , help='The rules file which adhere to strict formating' )
122-
@click.option('--list_name', is_flag=True, callback=list_adjective_name,
123-
expose_value=False, is_eager=True,
124-
help="List the names for all adjectives.tsv that are"
125-
" available, you can pass these names to limit the"
126-
" adjectives that are evaluated")
127-
@click.option('--show_rules_path', is_flag=True, callback=show_rules_path,
128-
expose_value=False, is_eager=True,
129-
help="Show the path to the default rules path.")
130-
@click.option('--list_id', is_flag=True, callback=list_adjectives,
131-
expose_value=False, is_eager=True,
132-
help="List the names for all adjectives.tsv that are"
133-
" available, you can pass these names to limit the"
134-
" adjectives that are evaluated")
135-
# @click.argument('-p', type=click.Path(exists=True))
136-
def rule_plot(rules_tsv:str=get_package_path('rules.tsv'),
137-
adjectives:list=None, plot_path:str=None):
138-
"""
139-
Using a DRAM annotations file make a table of adjectives.
140-
"""
141-
rules = RuleParser(rules_tsv, verbose=False, adjectives=adjectives)
142-
rules.plot_rule(plot_path)
96+
rules = pl.read_csv(rules_tsv, separator="\t")
97+
98+
dfs = {}
99+
sample_col = "input_fasta"
100+
group_by_col = "topic_ecosystem"
101+
if group_by_col not in rules.columns:
102+
rules = rules.with_columns(
103+
pl.lit("traits").alias(group_by_col)
104+
)
105+
for name, data in rules.group_by(group_by_col):
106+
dfs[name[0]] = evaluate_rules_on_anno(
107+
rules=data.lazy(),
108+
annotations_path=annotations,
109+
sample_col=sample_col,
110+
)
111+
write_summarized_genomes_to_xlsx(
112+
df=None,
113+
output_file=output,
114+
group_by=group_by_col,
115+
sort_order_columns=sample_col,
116+
extra_frames=dfs
117+
)
118+
119+
# Maybe someday we will add all of this back in?
120+
121+
# if plot_path is not None:
122+
# rules.plot_cause(plot_path, adjectives=plot_adjectives,
123+
# genomes=plot_genomes, show_steps=False
124+
# )
125+
# if strainer_tsv is not None:
126+
# strainer_data = get_positive_genes(rules, annotations, adjectives)
127+
# strainer_data.to_csv(strainer_tsv, sep='\t')
128+
129+
130+
# @click.command()
131+
# @click.argument('plot_path', type=click.Path(exists=False),
132+
# default=None)#, help='will become a folder of output plots, no path no plots.')
133+
# @click.option('-a', '--adjectives', multiple=True, default=[], help="A list of adjectives, by name, to evaluate. This limits the number of adjectives that are evaluated and is faster.")
134+
# @click.option('--rules_tsv', type=click.Path(exists=True),
135+
# default=get_assets_path('traits_rules.tsv'),
136+
# help='The path that will become a folder of output plots, no path no plots.') # , help='The rules file which adhere to strict formating' )
137+
# @click.option('--list_name', is_flag=True, callback=list_adjective_name,
138+
# expose_value=False, is_eager=True,
139+
# help="List the names for all adjectives.tsv that are"
140+
# " available, you can pass these names to limit the"
141+
# " adjectives that are evaluated")
142+
# @click.option('--show_rules_path', is_flag=True, callback=show_rules_path,
143+
# expose_value=False, is_eager=True,
144+
# help="Show the path to the default rules path.")
145+
# @click.option('--list_id', is_flag=True, callback=list_adjectives,
146+
# expose_value=False, is_eager=True,
147+
# help="List the names for all adjectives.tsv that are"
148+
# " available, you can pass these names to limit the"
149+
# " adjectives that are evaluated")
150+
# # @click.argument('-p', type=click.Path(exists=True))
151+
# def rule_plot(rules_tsv:str=get_assets_path('traits_rules.tsv'),
152+
# adjectives:list=None, plot_path:str=None):
153+
# """
154+
# Using a DRAM annotations file make a table of adjectives.
155+
# """
156+
# rules = RuleParser(rules_tsv, verbose=False, adjectives=adjectives)
157+
# rules.plot_rule(plot_path)
143158

144159
if __name__ == "__main__":
145160
evaluate()

0 commit comments

Comments
 (0)