Commit 6cfe2b6

Add languagetool_reviewdog.py script
1 parent fd877f3 commit 6cfe2b6

1 file changed: 185 additions & 0 deletions

languagetool_reviewdog.py
@@ -0,0 +1,185 @@
#!/usr/bin/env python3
import argparse
import json
import os
import re
import subprocess
from typing import Dict, List, Set, Tuple

import requests


def sh(*args: str) -> str:
    return subprocess.check_output(args, text=True).strip()


def offset_to_line_col(text: str, offset: int) -> Tuple[int, int]:
    # reviewdog wants 1-based line/column
    line = text.count("\n", 0, offset) + 1
    last_nl = text.rfind("\n", 0, offset)
    col = offset - (last_nl + 1) + 1
    return line, col
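
# Quick sanity check of the offset math (illustrative, not executed by the
# script):
#   offset_to_line_col("ab\ncd", 3) == (2, 1)   # offset 3 is "c": line 2, col 1
#   offset_to_line_col("ab\ncd", 0) == (1, 1)   # first character of the text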


def normalize_word(s: str) -> str:
    s = re.sub(r"^[\W_]+|[\W_]+$", "", s, flags=re.UNICODE)
    return s.lower()
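
# Only leading/trailing punctuation is stripped, so for example (illustrative):
#   normalize_word('"Kubernetes,"') -> "kubernetes"
#   normalize_word("foo-bar")       -> "foo-bar"   # interior hyphen survives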


def load_dictionary(path: str) -> Set[str]:
    if not path or not os.path.exists(path):
        return set()
    words: Set[str] = set()
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            words.add(line.lower())
    return words
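
# The words file is one entry per line, with "#" starting a comment.
# Hypothetical contents of .languagetool/words.txt:
#   # project jargon the spell checker should not flag
#   reviewdog
#   rdjson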


def changed_files(base_sha: str, head_sha: str) -> List[str]:
    # list only changed files in the PR
    out = sh("git", "diff", "--name-only", base_sha, head_sha)
    files = [x.strip() for x in out.splitlines() if x.strip()]
    return files
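
# Note: "git diff A B" is a two-dot diff; PR pipelines sometimes prefer the
# three-dot form ("git diff --name-only A...B"), which compares against the
# merge base so unrelated changes already on the base branch are not picked up.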


def is_text_file(path: str) -> bool:
    ext = os.path.splitext(path)[1].lower()
    return ext in {".md", ".txt", ".rst", ".adoc", ".asciidoc", ".tex"} or os.path.basename(path).lower() in {
        "readme", "readme.md", "readme.txt"
    }


def lt_check(api_url: str, language: str, text: str) -> Dict:
    resp = requests.post(
        api_url,
        data={
            "language": language,
            "text": text,
        },
        timeout=60,
    )
    resp.raise_for_status()
    return resp.json()
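
# The LanguageTool check response is JSON; abridged here to the fields the
# loop below actually reads (the real payload carries more):
#   {"matches": [{"message": "Possible spelling mistake found.",
#                 "offset": 5, "length": 4,
#                 "replacements": [{"value": "team"}],
#                 "rule": {"id": "MORFOLOGIK_RULE_EN_US",
#                          "category": {"id": "TYPOS"},
#                          "urls": [{"value": "https://..."}]}}]}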


def main() -> int:
    ap = argparse.ArgumentParser()
    ap.add_argument("--api-url", required=True)
    ap.add_argument("--language", required=True)
    ap.add_argument("--base-sha", required=True)
    ap.add_argument("--head-sha", required=True)
    ap.add_argument("--dictionary", default=".languagetool/words.txt")
    ap.add_argument("--max-suggestions", type=int, default=3)
    args = ap.parse_args()

    dict_words = load_dictionary(args.dictionary)

    files = changed_files(args.base_sha, args.head_sha)
    files = [f for f in files if os.path.exists(f) and is_text_file(f)]

    diagnostics: List[Dict] = []

    for path in files:
        try:
            with open(path, "r", encoding="utf-8") as f:
                content = f.read()
        except UnicodeDecodeError:
            with open(path, "r", encoding="utf-8", errors="replace") as f:
                content = f.read()

        if not content.strip():
            continue

        try:
            result = lt_check(args.api_url, args.language, content)
        except Exception as e:
            # Emit a single diagnostic if the API call fails for a file
            diagnostics.append(
                {
                    "message": f"LanguageTool API error for {path}: {e}",
                    "location": {"path": path, "range": {"start": {"line": 1, "column": 1}}},
                    "severity": "WARNING",
                }
            )
            continue

        matches = result.get("matches", [])
        for m in matches:
            offset = int(m.get("offset", 0))
            length = int(m.get("length", 0))
            bad = content[offset : offset + length]

            rule = m.get("rule", {}) or {}
            rule_id = rule.get("id") or "UNKNOWN_RULE"
            category = (rule.get("category", {}) or {}).get("id", "")

            # Cheap custom dictionary support without modifying LT server:
            # if LT reports a spelling/typo-ish issue AND the token is in our
            # dictionary -> ignore it. (Most spelling problems show up in
            # category TYPOS and/or rule ids containing MORFOLOGIK.)
            bad_norm = normalize_word(bad)
            if dict_words and bad_norm:
                looks_like_spelling = (category.upper() == "TYPOS") or ("MORFOLOGIK" in str(rule_id).upper())
                if looks_like_spelling and (bad_norm in dict_words):
                    continue

            start_line, start_col = offset_to_line_col(content, offset)
            end_line, end_col = offset_to_line_col(content, offset + max(length, 0))

            # Suggestions (as rdjson "suggestions" with ranges)
            suggestions = []
            repls = m.get("replacements", []) or []
            for r in repls[: args.max_suggestions]:
                val = r.get("value")
                if not val:
                    continue
                suggestions.append(
                    {
                        "range": {
                            "start": {"line": start_line, "column": start_col},
                            "end": {"line": end_line, "column": end_col},
                        },
                        "text": val,
                    }
                )

            code = {"value": rule_id}
            urls = rule.get("urls") or []
            if urls and isinstance(urls, list):
                u = urls[0].get("value")
                if u:
                    code["url"] = u

            diagnostics.append(
                {
                    "message": m.get("message") or "LanguageTool finding",
                    "location": {
                        "path": path,
                        "range": {
                            "start": {"line": start_line, "column": start_col},
                            "end": {"line": end_line, "column": end_col},
                        },
                    },
                    "severity": "WARNING",
                    "code": code,
                    **({"suggestions": suggestions} if suggestions else {}),
                }
            )

    rdjson = {
        "source": {
            "name": "LanguageTool",
            "url": "https://languagetool.org",
        },
        "diagnostics": diagnostics,
    }

    print(json.dumps(rdjson))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
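
One way to wire this into a PR check, assuming a LanguageTool server is reachable at the URL passed as --api-url (host, port, and endpoint path depend on the deployment) and that BASE_SHA/HEAD_SHA are placeholders for the PR's commit range:

python3 languagetool_reviewdog.py \
  --api-url http://localhost:8010/v2/check \
  --language en-US \
  --base-sha "$BASE_SHA" \
  --head-sha "$HEAD_SHA" \
| reviewdog -f=rdjson -reporter=github-pr-review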
