-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmeilisearch-wiki.py
More file actions
81 lines (74 loc) · 2.78 KB
/
meilisearch-wiki.py
File metadata and controls
81 lines (74 loc) · 2.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os
import re
import requests
import json
from pypinyin import lazy_pinyin, Style
# Matches a Markdown ATX heading marker: 1-6 '#' characters, an optional
# space, then the remainder of the line. The pattern is not anchored, so it
# can also match a '#' run that appears in the middle of a line.
title_regex = re.compile(r'[#]{1,6} ?.*')
# Matches every character that is NOT an ASCII letter, digit, '_' or '-'.
# The upper-case range has to be "A-Z": the previous "A-z" also covered the
# ASCII punctuation located between 'Z' and 'a' ([ \ ] ^ `), so those
# characters were never matched by this "invalid character" pattern.
index_regex = re.compile(r'[^a-z_A-Z0-9-]')
def countFiles(root_path, total_files):
    """Recursively collect the paths of all regular files under a directory.

    Entries are visited in ``os.listdir`` order and sub-directories are
    descended into as soon as they are encountered (depth-first).

    Args:
        root_path: Directory to scan.
        total_files: List that receives the file paths; it is mutated in
            place so the recursive calls can share one accumulator.

    Returns:
        The same ``total_files`` list that was passed in.

    Raises:
        FileNotFoundError: If ``root_path`` does not exist.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not os.path.exists(root_path):
        raise FileNotFoundError(root_path)
    for item in os.listdir(root_path):
        next_path = os.path.join(root_path, item)
        if os.path.isfile(next_path):
            total_files.append(next_path)
        elif os.path.isdir(next_path):
            countFiles(next_path, total_files)
        # Anything else (dangling symlink, socket, FIFO, ...) is neither a
        # file to index nor a directory to descend into, so it is skipped
        # rather than aborting the whole scan.
    return total_files
def mdProcess(inputMD: str):
    """Strip Markdown markup from a heading line or from a whole document.

    A string without any newline is treated as a bare heading: its leading
    ``#`` markers and all spaces are removed. Any other string is treated as
    document content and has a fixed list of Markdown constructs removed so
    that (mostly) plain text remains.

    Args:
        inputMD: A single heading line, or Markdown text containing newlines.

    Returns:
        The input with the recognised markup removed.
    """
    # Bare heading (no newline in the input).
    if '\n' not in inputMD:
        # Drop the whole run of leading '#', not just one "# " occurrence,
        # so that "## Title" does not keep a stray '#'.
        return inputMD.lstrip('#').replace(' ', '')
    # Document content: remove Markdown symbols, only the plain text is wanted.
    regexes = (
        # Heading marker, when the rest of the line contains no further '#'.
        r'[#]{1,6} (?=[^#\n]*?\n)',
        # Horizontal rule made of asterisks.
        r'\*{3,999} *\n',
        # Table separator row (|---|---|), removed cell by cell.
        r'\|-*?(?=\|-)', r'\|-*?\| {0,2}\n',
        # Code-fence line, with or without a language tag. Alternation takes
        # the first alternative that matches, so a longer tag must precede any
        # shorter tag that is its prefix (json before js; cpp/clike/clojure/
        # css before c); otherwise "```json" would leave "on" behind.
        r'```(\n|javascript|json|js|python|py|markdown|md|php|lua|cpp|clike|clojure|css|c|ruby|rust|flow|shell|powershell|glsl|yml|yaml)\n?',
        # Image that ends its line.
        r'!\[.*?\]\(.*?\)( *\n)',
        # Horizontal rule made of dashes.
        r'-{3,999} *\n',
        # Strike-through markers.
        '~~',
    )
    for regex in regexes:
        inputMD = re.sub(regex, '', inputMD)
    return inputMD
def main():
    """Index every non-javadoc Markdown file of the wiki into Meilisearch.

    The wiki root is the parent of the directory that contains this script.
    Each Markdown file that has at least one heading becomes one document
    (id, title, path, list of headings, stripped content); all documents are
    sent in a single POST request.

    Raises:
        SystemExit: With exit code 1 when the server answers with an HTTP
            error status.
    """
    # Wiki root directory. abspath() is required because __file__ may be a
    # relative path (e.g. `python meilisearch-wiki.py`); dirname() applied
    # twice would then yield '' and the directory scan would fail.
    working_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # Collect every file below the wiki root.
    files_absolute = countFiles(working_dir, [])

    # Keep the non-javadoc Markdown files. Keys are '/'-separated paths
    # relative to the wiki root with a leading '/'; values are the real paths
    # used for reading.
    files_relative = {}
    for absolute_path in files_absolute:
        if not absolute_path.endswith(".md"):
            continue
        # relpath() removes only the root prefix, whereas str.replace() would
        # also delete later occurrences of the root string inside the path.
        relative_path = '/' + os.path.relpath(absolute_path, working_dir).replace('\\', '/')
        if relative_path.startswith("/javadoc"):
            continue
        files_relative[relative_path] = absolute_path

    # Read all selected files.
    files_content = {}
    for relative_path, absolute_path in files_relative.items():
        print("reading " + relative_path)
        with open(absolute_path, 'r', encoding='utf-8') as f:
            files_content[relative_path] = f.read()

    # Extract headings and build one document per file.
    document = []
    for path, content in files_content.items():
        passage_titles = title_regex.findall(content)
        if not passage_titles:
            # No heading found: report the file and leave it out of the index.
            print(path)
            continue
        # Document id: romanised path restricted to [a-zA-Z0-9-_]. The
        # derivation is kept exactly as before so existing ids stay stable.
        # NOTE(review): assumes lazy_pinyin returns a list of strings - confirm.
        doc_id = re.sub(r'[^a-zA-Z0-9-_]', '', '_'.join(lazy_pinyin(path[1:].replace('/', '-').replace('.md', ''))))
        document.append({
            'id': doc_id,
            'title': mdProcess(passage_titles[0]),
            'path': path,
            'titles': passage_titles,
            'content': mdProcess(content),
        })

    # Send the result.
    dt = json.dumps(document)
    headers = {'Content-Type': 'application/json'}
    key = os.getenv("Meili_API_Key")
    if key is not None:
        headers['X-Meili-API-Key'] = key
    else:
        print("warning: Meili_API_Key is not set; sending the request without an API key")
    # A timeout keeps the script from hanging forever on an unreachable server.
    result = requests.post(
        url='http://49.232.170.120:7700/indexes/wiki/documents',
        data=dt,
        headers=headers,
        timeout=60,
    )
    if not result.ok:
        # Do not report success when the server rejected the upload.
        print("FAILED (HTTP " + str(result.status_code) + "):" + result.text)
        raise SystemExit(1)
    print("OK:" + result.text)


if __name__ == "__main__":
    main()