@@ -167,27 +167,27 @@ def word_regex(word):
167167 return r"(\b|(?<=[\-_]))" + re.escape(word) + r"\b"
168168
169169 noise_words = [
170- "subtitles?",
171- "subtitle",
172- "sub-titles",
173- "subbed",
174- "with subtitles?",
170+ "-handwritten",
171+ "-spoken",
172+ "=",
175173 "english patch",
176- "handwritten",
177174 "hand write",
178- "hand-written",
179175 "hand written",
180- "-handwritten",
181- "no voice",
182- "no spoken word",
183- "no speech",
176+ "hand-written",
177+ "handwritten",
184178 "instrumental",
185- "universal",
186179 "language",
187- "=",
180+ "no speech",
181+ "no spoken word",
182+ "no voice",
188183 "simple",
189184 "spoken",
190- "-spoken",
185+ "sub-titles",
186+ "subbed",
187+ "subtitle",
188+ "subtitles?",
189+ "universal",
190+ "with subtitles?",
191191 ]
192192
193193 # Combine all noise words into one regex
@@ -242,56 +242,56 @@ def normalize_language(raw_language):
242242
243243 # --- Try Alias Map ---
244244 ALIAS_MAP = {
245- "engrish": "English",
246- "english_handwritten": "English",
247- "enlgish": "English",
248245 "american english": "English",
249- "english - american": "English",
250246 "american": "English",
251- "uk english": "English",
252- "eglish": "English",
253- "egligh": "English",
254- "english (us)": "English",
255- "us-en": "English",
256- "sgn": "Sign languages",
257247 "anglais": "English",
258- "us english": "English",
259- "indian english": "English",
260- "hwbrew": "Hebrew",
261- "polska": "Polish",
262248 "bosanski": "Bosnian",
263- "український ": "Ukrainian ",
249+ "castellano": "Spanish",
264250 "chinese sub": "Chinese",
265- "spain": "Spanish",
266- "português e espanhol": "Multiple languages",
267- "русский": "Russian",
268251 "deutsch": "German",
269- "france": "French",
270- "francais": "French",
271- "italiano": "Italian",
272- "ilokano": "Ilokano",
273- "viẹetnamese": "Vietnamese",
274- "português": "Portuguese",
275- "pt_br": "Portuguese",
252+ "egligh": "English",
253+ "eglish": "English",
254+ "en_us es_es": "Multiple languages",
255+ "english & chinese subbed": "Multiple languages",
256+ "english (us)": "English",
257+ "english - american": "English",
258+ "english_handwritten": "English",
259+ "engrish": "English",
260+ "enlgish": "English",
276261 "espanol": "Spanish",
277- "castellano": "Spanish",
262+ "francais": "French",
263+ "france": "French",
278264 "greek": "Greek",
265+ "hwbrew": "Hebrew",
266+ "ilokano": "Ilokano",
267+ "indian english": "English",
268+ "italiano": "Italian",
279269 "mandarin": "Chinese",
280- "nederlands": "Dutch",
281- "swahili": "Swahili",
282- "no language (english)": "Undetermined",
283- "whatever we play it to be": "Undetermined",
284- "en_us es_es": "Multiple languages",
285- "english & chinese subbed": "Multiple languages",
286- "n/a": "Undetermined",
287- "none": "Undetermined",
288- "unknown": "Undetermined",
289- "no speech": "Undetermined",
290- "no spoken language": "Undetermined",
291270 "multi": "Multiple Languages",
292271 "multilanguage": "Multiple languages",
293272 "multiple": "Multiple Languages",
294273 "music": "Undetermined",
274+ "n/a": "Undetermined",
275+ "nederlands": "Dutch",
276+ "no language (english)": "Undetermined",
277+ "no speech": "Undetermined",
278+ "no spoken language": "Undetermined",
279+ "none": "Undetermined",
280+ "polska": "Polish",
281+ "português e espanhol": "Multiple languages",
282+ "português": "Portuguese",
283+ "pt_br": "Portuguese",
284+ "sgn": "Sign languages",
285+ "spain": "Spanish",
286+ "swahili": "Swahili",
287+ "uk english": "English",
288+ "unknown": "Undetermined",
289+ "us english": "English",
290+ "us-en": "English",
291+ "viẹetnamese": "Vietnamese",
292+ "whatever we play it to be": "Undetermined",
293+ "русский": "Russian",
294+ "український": "Ukrainian",
295295 }
296296 ALIAS_MAP = {normalize_key(k): v for k, v in ALIAS_MAP.items()}
297297
0 commit comments