|
10 | 10 | # limitations under the License. |
11 | 11 |
|
12 | 12 | """ |
13 | | -Query masking tests |
| 13 | +Query masking tests — core masking logic |
14 | 14 |
|
15 | 15 | Tests for masking SQL queries with different parsers (SqlGlot, SqlFluff, SqlParse). |
16 | | -Since all parsers now use SqlParse for masking, they should produce identical output. |
| 16 | +Covers: parser dispatch, caching, literal types, ordinal preservation edge cases. |
17 | 17 | """ |
18 | 18 | from unittest import TestCase |
19 | 19 |
|
20 | | -import pytest |
21 | | - |
22 | | -from ingestion.tests.unit.lineage.queries.helpers import assert_masked_query |
| 20 | +from ingestion.tests.unit.lineage.masker.helpers import assert_masked_query |
23 | 21 | from metadata.ingestion.lineage.masker import mask_query, masked_query_cache |
24 | 22 | from metadata.ingestion.lineage.models import Dialect |
25 | 23 |
|
@@ -290,174 +288,53 @@ def test_masking_when_no_parser_but_required(self): |
290 | 288 |
|
291 | 289 | assert masked_query is None |
292 | 290 |
|
293 | | - # Dialect specific query masking tests |
294 | | - |
295 | | - def test_postgres_typed_literals_nested_subquery(self): |
| 291 | + def test_group_by_order_by_ordinal_positions_preserved(self): |
296 | 292 | """ |
297 | | - Test masking of Postgres typed literals in nested subqueries. |
298 | | - """ |
299 | | - query_test_cases = [ |
300 | | - { |
301 | | - "query": "SELECT * FROM orders WHERE order_date = DATE '2023-10-01' AND customer_id IN (SELECT id FROM customers WHERE signup_date = TIMESTAMP '2022-01-15 10:30:00');", # noqa: E501 |
302 | | - "expected": "SELECT * FROM orders WHERE order_date = DATE ? AND customer_id IN (SELECT id FROM customers WHERE signup_date = TIMESTAMP ?);", # noqa: E501 |
303 | | - "dialect": Dialect.POSTGRES.value, |
304 | | - } |
305 | | - ] |
| 293 | + Test that integer ordinal positions in GROUP BY and ORDER BY clauses |
| 294 | + are NOT masked, while other literals (WHERE, HAVING, LIMIT, OFFSET) still are. |
306 | 295 |
|
307 | | - for test_case in query_test_cases: |
308 | | - assert_masked_query( |
309 | | - test_case["query"], |
310 | | - test_case["expected"], |
311 | | - test_case["dialect"], |
312 | | - "SqlGlot", |
313 | | - ) |
314 | | - assert_masked_query( |
315 | | - test_case["query"], |
316 | | - test_case["expected"], |
317 | | - test_case["dialect"], |
318 | | - "SqlFluff", |
319 | | - ) |
320 | | - assert_masked_query( |
321 | | - test_case["query"], |
322 | | - test_case["expected"], |
323 | | - test_case["dialect"], |
324 | | - "SqlParse", |
325 | | - ) |
326 | | - |
327 | | - def test_mysql_implicit_typing_functions_limits_column_if(self): |
328 | | - """ |
329 | | - Test masking of MySQL implicit typing functions like IF, LIMIT. |
| 296 | + This addresses a reported issue where positional references like |
| 297 | + GROUP BY 1, 2, 3 were being incorrectly masked to GROUP BY ?, ?, ? |
330 | 298 | """ |
331 | 299 | query_test_cases = [ |
332 | 300 | { |
333 | | - "query": "SELECT IF(status = 'active', 1, 0) AS is_active, DATE(created_at) AS created_day FROM accounts WHERE score > 99.5 AND created_at BETWEEN '2024-01-01' AND '2024-12-31' ORDER BY created_at DESC LIMIT 10 OFFSET 5;", # noqa: E501 |
334 | | - "expected": "SELECT IF(status = ?, ?, ?) AS is_active, DATE(created_at) AS created_day FROM accounts WHERE score > ? AND created_at BETWEEN ? AND ? ORDER BY created_at DESC LIMIT ? OFFSET ?;", # noqa: E501 |
335 | | - "dialect": Dialect.MYSQL.value, |
336 | | - } |
337 | | - ] |
338 | | - |
339 | | - for test_case in query_test_cases: |
340 | | - assert_masked_query( |
341 | | - test_case["query"], |
342 | | - test_case["expected"], |
343 | | - test_case["dialect"], |
344 | | - "SqlGlot", |
345 | | - ) |
346 | | - assert_masked_query( |
347 | | - test_case["query"], |
348 | | - test_case["expected"], |
349 | | - test_case["dialect"], |
350 | | - "SqlFluff", |
351 | | - ) |
352 | | - assert_masked_query( |
353 | | - test_case["query"], |
354 | | - test_case["expected"], |
355 | | - test_case["dialect"], |
356 | | - "SqlParse", |
357 | | - ) |
358 | | - |
359 | | - def test_bigquery_struct_array_unnest(self): |
360 | | - """ |
361 | | - Test masking of BigQuery STRUCTs, ARRAYs, and UNNEST. |
362 | | - """ |
363 | | - query_test_cases = [ |
| 301 | + "query": "SELECT a, b, c FROM t WHERE x > 5 GROUP BY 1, 2, 3 HAVING COUNT(*) > 1 ORDER BY 1 ASC LIMIT 500 OFFSET 0;", # noqa: E501 |
| 302 | + "expected": "SELECT a, b, c FROM t WHERE x > ? GROUP BY 1, 2, 3 HAVING COUNT(*) > ? ORDER BY 1 ASC LIMIT ? OFFSET ?;", # noqa: E501 |
| 303 | + "dialect": Dialect.ANSI.value, |
| 304 | + }, |
364 | 305 | { |
365 | | - "query": "SELECT u.name, u.age, a.city FROM UNNEST([STRUCT('alice' AS name, 25 AS age, [STRUCT('NY' AS city)])]) AS u, UNNEST(u.f2) AS a WHERE u.age > 21 AND a.city = 'NY';", # noqa: E501 |
366 | | - "expected": "SELECT u.name, u.age, a.city FROM UNNEST([STRUCT(? AS name, ? AS age, [STRUCT(? AS city)])]) AS u, UNNEST(u.f2) AS a WHERE u.age > ? AND a.city = ?;", # noqa: E501 |
367 | | - "dialect": Dialect.BIGQUERY.value, |
| 306 | + # Mixed column names and positional integers in GROUP BY / ORDER BY |
| 307 | + "query": "SELECT a, b FROM t GROUP BY a, 2 ORDER BY b, 1 DESC;", |
| 308 | + "expected": "SELECT a, b FROM t GROUP BY a, 2 ORDER BY b, 1 DESC;", |
| 309 | + "dialect": Dialect.ANSI.value, |
368 | 310 | }, |
369 | | - ] |
370 | | - |
371 | | - for test_case in query_test_cases: |
372 | | - # TODO: Not masking `'NY'` inside STRUCT whereas `'alice'` and `25` are masked, need to validate |
373 | | - # assert_masked_query( |
374 | | - # test_case["query"], |
375 | | - # test_case["expected"], |
376 | | - # test_case["dialect"], |
377 | | - # "SqlGlot", |
378 | | - # ) |
379 | | - assert_masked_query( |
380 | | - test_case["query"], |
381 | | - test_case["expected"], |
382 | | - test_case["dialect"], |
383 | | - "SqlFluff", |
384 | | - ) |
385 | | - # TODO: Not masking `'NY'` inside STRUCT whereas `'alice'` and `25` are masked, need to validate |
386 | | - # assert_masked_query( |
387 | | - # test_case["query"], |
388 | | - # test_case["expected"], |
389 | | - # test_case["dialect"], |
390 | | - # "SqlParse", |
391 | | - # ) |
392 | | - |
393 | | - def test_snowflake_variant_json_casting(self): |
394 | | - """ |
395 | | - Test masking of Snowflake VARIANT/JSON data types and casting. |
396 | | - """ |
397 | | - query_test_cases = [ |
398 | 311 | { |
399 | | - "query": "SELECT data:id AS user_id, data:profile.name AS user_name, data:profile.age::INT AS user_age FROM events WHERE data:profile.age > 30 AND data:profile.status = 'active';", # noqa: E501 |
400 | | - "expected": "SELECT data:id AS user_id, data:profile.name AS user_name, data:profile.age::INT AS user_age FROM events WHERE data:profile.age > ? AND data:profile.status = ?;", # noqa: E501 |
401 | | - "dialect": Dialect.SNOWFLAKE.value, |
402 | | - } |
403 | | - ] |
404 | | - |
405 | | - for test_case in query_test_cases: |
406 | | - assert_masked_query( |
407 | | - test_case["query"], |
408 | | - test_case["expected"], |
409 | | - test_case["dialect"], |
410 | | - "SqlGlot", |
411 | | - ) |
412 | | - assert_masked_query( |
413 | | - test_case["query"], |
414 | | - test_case["expected"], |
415 | | - test_case["dialect"], |
416 | | - "SqlFluff", |
417 | | - ) |
418 | | - assert_masked_query( |
419 | | - test_case["query"], |
420 | | - test_case["expected"], |
421 | | - test_case["dialect"], |
422 | | - "SqlParse", |
423 | | - ) |
424 | | - |
425 | | - @pytest.mark.skip( |
426 | | - reason="SqlGlot and SqlFluff do not support DECLARE statement type yet." |
427 | | - " Additionally multi-statement handling needs to be evaluated later." |
428 | | - ) |
429 | | - def test_tsql_variables_convert(self): |
430 | | - """ |
431 | | - Test masking of T-SQL variables and CONVERT function. |
432 | | - """ |
433 | | - query_test_cases = [ |
| 312 | + # CTE with GROUP BY positional references (similar to reported Payoneer query) |
| 313 | + "query": "WITH cte AS (SELECT a FROM t WHERE x = 'val' GROUP BY 1, 2, 3 HAVING COUNT(*) > 1 ORDER BY 1 ASC) SELECT * FROM cte LIMIT 500 OFFSET 0;", # noqa: E501 |
| 314 | + "expected": "WITH cte AS (SELECT a FROM t WHERE x = ? GROUP BY 1, 2, 3 HAVING COUNT(*) > ? ORDER BY 1 ASC) SELECT * FROM cte LIMIT ? OFFSET ?;", # noqa: E501 |
| 315 | + "dialect": Dialect.ANSI.value, |
| 316 | + }, |
434 | 317 | { |
435 | | - "query": "DECLARE @startDate DATETIME = '2024-01-01'; DECLARE @endDate DATETIME = '2024-12-31'; SELECT * FROM events WHERE event_date BETWEEN @startDate AND @endDate;", # noqa: E501 |
436 | | - "expected": "DECLARE @startDate DATETIME = '2024-01-01'; DECLARE @endDate DATETIME = '2024-12-31'; SELECT * FROM events WHERE event_date BETWEEN ? AND ?;", # noqa: E501 |
437 | | - "dialect": Dialect.TSQL.value, |
438 | | - } |
| 318 | + # BigQuery dialect with GROUP BY positional references |
| 319 | + "query": "SELECT full_name, COUNT(*) AS rn FROM t WHERE role = 'admin' AND dept IN ('a', 'b') GROUP BY 1, 2, 3, 4, 5, 6 HAVING COUNT(*) > 1 ORDER BY 1 ASC LIMIT 500 OFFSET 0;", # noqa: E501 |
| 320 | + "expected": "SELECT full_name, COUNT(*) AS rn FROM t WHERE role = ? AND dept IN (?, ?) GROUP BY 1, 2, 3, 4, 5, 6 HAVING COUNT(*) > ? ORDER BY 1 ASC LIMIT ? OFFSET ?;", # noqa: E501 |
| 321 | + "dialect": Dialect.BIGQUERY.value, |
| 322 | + }, |
439 | 323 | ] |
440 | 324 |
|
441 | 325 | for test_case in query_test_cases: |
442 | | - # TODO: sqlglot doesn't support analyzing statement type [declare] for |
443 | | - # SQL: DECLARE @startDate DATETIME = '2024-01-01'; |
444 | 326 | assert_masked_query( |
445 | 327 | test_case["query"], |
446 | 328 | test_case["expected"], |
447 | 329 | test_case["dialect"], |
448 | 330 | "SqlGlot", |
449 | 331 | ) |
450 | | - # TODO: sqlfluff doesn't support analyzing statement type [declare] for |
451 | | - # SQL: DECLARE @startDate DATETIME = '2024-01-01'; |
452 | 332 | assert_masked_query( |
453 | 333 | test_case["query"], |
454 | 334 | test_case["expected"], |
455 | 335 | test_case["dialect"], |
456 | 336 | "SqlFluff", |
457 | 337 | ) |
458 | | - # TODO: since our parser is designed to handle one sql statement at a |
459 | | - # time, it returns last masked statement only, need to evaluate later |
460 | | - # if multi-statement handling is required |
461 | 338 | assert_masked_query( |
462 | 339 | test_case["query"], |
463 | 340 | test_case["expected"], |
|
0 commit comments