Skip to content

Commit 174f397

Browse files
committed
Make SqlParse default parser for query masking and add execution tracking for masking parser (#26203)
* Make SqlParse default parser for query masking and add execution tracking for masking parser
* Optimize masking parser pickup logic
1 parent 4fa3e02 commit 174f397

2 files changed

Lines changed: 33 additions & 23 deletions

File tree

ingestion/src/metadata/ingestion/lineage/masker.py

Lines changed: 31 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
"""
1212
Query masking utilities
1313
14-
All masking functions (SqlParse, SqlFluff, SqlGlot) reuse the already-parsed AST
14+
Masking functions (SqlParse, SqlFluff) reuse the already-parsed AST
1515
from the LineageRunner to avoid duplicate parsing and improve performance.
1616
"""
1717

@@ -21,7 +21,6 @@
2121

2222
from cachetools import LRUCache
2323
from collate_sqllineage.core.parser.sqlfluff.analyzer import SqlFluffLineageAnalyzer
24-
from collate_sqllineage.core.parser.sqlglot.analyzer import SqlGlotLineageAnalyzer
2524
from collate_sqllineage.core.parser.sqlparse.analyzer import SqlParseLineageAnalyzer
2625
from collate_sqllineage.runner import LineageRunner
2726
from sqlparse.sql import Comparison
@@ -130,6 +129,22 @@ def replace_literals(segment):
130129
return query
131130

132131

132+
@calculate_execution_time(context="GetSqlParseLineageRunner")
133+
def get_sqlparse_lineage_runner(query: str) -> LineageRunner:
134+
lr_sqlparse = LineageRunner(query, analyzer=SqlParseLineageAnalyzer)
135+
len(lr_sqlparse.source_tables)
136+
return lr_sqlparse
137+
138+
139+
@calculate_execution_time(context="GetSqlFluffLineageRunner")
140+
def get_sqlfluff_lineage_runner(query: str, dialect: str) -> LineageRunner:
141+
lr_sqlfluff = LineageRunner(
142+
query, dialect=dialect, analyzer=SqlFluffLineageAnalyzer
143+
)
144+
len(lr_sqlfluff.source_tables)
145+
return lr_sqlfluff
146+
147+
133148
@calculate_execution_time(context="MaskQuery")
134149
def mask_query(
135150
query: str,
@@ -159,7 +174,8 @@ def mask_query_impl(
159174
query_hash: Optional[str] = None,
160175
) -> Optional[str]:
161176
"""
162-
Mask a query using SqlGlot, SqlFluff, or SqlParse based on the analyzer used.
177+
Mask a query using SqlParse or SqlFluff.
178+
Only these two analyzers support literal masking (SqlGlot is excluded).
163179
"""
164180
hash_prefix = f"[{query_hash}] " if query_hash else ""
165181

@@ -170,34 +186,27 @@ def mask_query_impl(
170186
logger.debug(f"{hash_prefix}Query masking skipped as no parser available.")
171187
return None
172188

173-
masking_parser = parser
174-
# Since SqlGlot generalizes query structures/syntax, we will use
175-
# SqlParse for masking if SqlGlot is used for parsing
176-
if parser and isinstance(parser._analyzer, SqlGlotLineageAnalyzer):
177-
masking_parser = LineageRunner(query, analyzer=SqlParseLineageAnalyzer)
178-
len(masking_parser.source_tables)
189+
masking_parser = None
190+
191+
# Only reuse parser if it's already SqlParse or SqlFluff
192+
if parser and isinstance(
193+
parser._analyzer, (SqlParseLineageAnalyzer, SqlFluffLineageAnalyzer)
194+
):
195+
masking_parser = parser
179196

197+
# If no suitable parser, create one with fallback: SqlParse → SqlFluff
180198
if not masking_parser:
181-
# Try to create a parser with the same fallback strategy as LineageParser
182-
# but since we are not using SqlGlot for masking, we skip it here.
183-
# Try SqlFluff, then SqlParse
184-
# TODO: Evaluate if sqlparse should be the first choice here since it is
185-
# faster and almost same support as sqlfluff for masking literals.
186199
try:
187-
masking_parser = LineageRunner(
188-
query, dialect=dialect, analyzer=SqlFluffLineageAnalyzer
189-
)
190-
len(masking_parser.source_tables)
200+
masking_parser = get_sqlparse_lineage_runner(query)
191201
except Exception:
192-
masking_parser = LineageRunner(query, analyzer=SqlParseLineageAnalyzer)
193-
len(masking_parser.source_tables)
202+
masking_parser = get_sqlfluff_lineage_runner(query, dialect=dialect)
194203

195204
logger.debug(
196205
f"{hash_prefix}Query masking started using [{masking_parser._analyzer.__class__.__name__}]"
197206
f" for parser [{parser and parser._analyzer.__class__.__name__}]"
198207
)
199208

200-
# Check which analyzer was used based on _analyzer attribute
209+
# Dispatch to appropriate masking function
201210
if isinstance(masking_parser._analyzer, SqlFluffLineageAnalyzer):
202211
masked_query = mask_literals_with_sqlfluff(
203212
query, masking_parser, query_hash
@@ -208,7 +217,7 @@ def mask_query_impl(
208217
)
209218
else:
210219
logger.debug(
211-
f"{hash_prefix}Query masking skipped as no parser._analyzer available."
220+
f"{hash_prefix}Query masking skipped as no supported analyzer available."
212221
f" Analyzer: {masking_parser._analyzer}"
213222
)
214223
return None

ingestion/src/metadata/ingestion/lineage/parser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from collate_sqllineage.core.models import Column, DataFunction, Location, Table
2626
from collate_sqllineage.core.parser.sqlfluff.analyzer import SqlFluffLineageAnalyzer
2727
from collate_sqllineage.core.parser.sqlglot.analyzer import SqlGlotLineageAnalyzer
28+
from collate_sqllineage.core.parser.sqlparse.analyzer import SqlParseLineageAnalyzer
2829
from collate_sqllineage.exceptions import SQLLineageException
2930
from collate_sqllineage.runner import LineageRunner
3031
from sqlparse.sql import Comparison, Identifier, Parenthesis, Statement
@@ -690,7 +691,7 @@ def get_sqlfluff_lineage_runner(query: str, dialect: str) -> LineageRunner:
690691
# context=self.query_hash,
691692
# )
692693
def get_sqlparse_lineage_runner(query: str) -> LineageRunner:
693-
lr_sqlparse = LineageRunner(query)
694+
lr_sqlparse = LineageRunner(query, analyzer=SqlParseLineageAnalyzer)
694695
lr_sqlparse.get_column_lineage()
695696
return lr_sqlparse
696697

0 commit comments

Comments
 (0)