Skip to content

Commit 8c58268

Browse files
committed
Use new commitfest API instead of scraping
1 parent 2dc05a4 commit 8c58268

2 files changed

Lines changed: 37 additions & 82 deletions

File tree

cfbot_commitfest.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ def pull_submissions(conn, commitfest_id):
2929
AND name = %s
3030
AND status = %s
3131
AND authors = %s
32-
AND last_email_time = %s AT TIME ZONE 'UTC'""",
32+
AND last_email_time = %s""",
3333
(
3434
commitfest_id,
3535
submission.id,
@@ -49,7 +49,7 @@ def pull_submissions(conn, commitfest_id):
4949
"""INSERT INTO submission (commitfest_id, submission_id,
5050
name, status, authors,
5151
last_email_time)
52-
VALUES (%s, %s, %s, %s, %s, %s AT TIME ZONE 'UTC')
52+
VALUES (%s, %s, %s, %s, %s, %s)
5353
ON CONFLICT (commitfest_id, submission_id) DO
5454
UPDATE
5555
SET name = EXCLUDED.name,

cfbot_commitfest_rpc.py

Lines changed: 35 additions & 80 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,11 @@
11
#!/usr/bin/env python
22
#
33
# Routines that interface with the Commitfest app.
4-
# For now these use webscraping, but they could become real API calls.
4+
5+
from datetime import datetime
56

67
import cfbot_config
78
import cfbot_util
8-
import html
9-
10-
# from html.parser import HTMLParser
119
import re
1210

1311

@@ -108,89 +106,46 @@ def get_latest_patches_from_thread_url(thread_url):
108106
def get_thread_url_for_submission(commitfest_id, submission_id):
109107
"""Given a Commitfest ID and a submission ID, return the URL of the 'whole
110108
thread' page in the mailing list archives."""
111-
# find all the threads and latest message times
112-
result = None
113-
url = f"{cfbot_config.COMMITFEST_HOST}/patch/{submission_id}/"
114-
candidates = []
115-
candidate = None
116-
submission_page = cfbot_util.slow_fetch(url, none_for_404=True)
117-
118-
if submission_page is None:
109+
url = f"{cfbot_config.COMMITFEST_HOST}/api/v1/patches/{submission_id}/threads"
110+
data = cfbot_util.slow_fetch_json(url, none_for_404=True)
111+
112+
if data is None:
119113
return None
120114

121-
for line in submission_page.splitlines():
122-
groups = re.search(
123-
"""Latest at <a href="https://www.postgresql.org/message-id/([^"]+)">(2[^<]+)""",
124-
line,
125-
)
126-
if groups:
127-
candidate = (groups.group(2), groups.group(1))
128-
# we'll only take threads that are followed by evidence that there is at least one attachment
129-
groups = re.search("""Latest attachment .* <button type="button" """, line)
130-
if groups:
131-
candidates.append(candidate)
132-
# take the one with the most recent email
133-
if len(candidates) > 0:
134-
candidates.sort()
135-
result = "https://www.postgresql.org/message-id/flat/" + candidates[-1][1]
136-
return result
115+
# Filter to threads that have attachments, then pick the one with the most
116+
# recent message
117+
candidates = [
118+
(t["latest_message_time"], t["messageid"])
119+
for t in data["threads"]
120+
if t["has_attachment"]
121+
]
122+
123+
if not candidates:
124+
return None
125+
126+
candidates.sort()
127+
return "https://www.postgresql.org/message-id/flat/" + candidates[-1][1]
137128

138129

139130
def get_submissions_for_commitfest(commitfest_id):
140131
"""Given a Commitfest ID, return a list of Submission objects."""
141-
result = []
142-
# parser = HTMLParser()
143-
url = f"{cfbot_config.COMMITFEST_HOST}/{commitfest_id}/"
144-
state = None
145-
latest_email = None
146-
authors = ""
147-
td_count = 0
148-
body = cfbot_util.slow_fetch(url, True)
149-
if body is None:
132+
url = f"{cfbot_config.COMMITFEST_HOST}/api/v1/commitfests/{commitfest_id}/patches"
133+
data = cfbot_util.slow_fetch_json(url, none_for_404=True)
134+
135+
if data is None:
150136
return []
151-
for line in body.splitlines():
152-
# maybe it's easier to count rows and columns
153-
if re.search("<tr>", line):
154-
td_count = 0
155-
continue
156-
if re.search("<td[^>]*>", line):
157-
td_count += 1
158-
159-
groups = re.search('<a href="/patch/([0-9]+)/">([^<]+)</a>', line)
160-
if groups:
161-
submission_id = groups.group(1)
162-
name = html.unescape(groups.group(2))
163-
continue
164-
if td_count == 8:
165-
groups = re.search("<td>([^<]*)</td>", line)
166-
if groups:
167-
authors = groups.group(1)
168-
authors = re.sub(" *\\([^)]*\\)", "", authors)
169-
continue
170-
if td_count == 3:
171-
groups = re.search(
172-
'<td><span class="badge[^"]*">([^<]+)</span></td>',
173-
line,
174-
# '<td><span class="label label-[^"]*">([^<]+)</span></td>', line
175-
)
176-
if groups and not state:
177-
state = groups.group(1)
178-
continue
179-
groups = re.search('<td style="white-space: nowrap;" title="([^"]+)">', line)
180-
if groups:
181-
latest_email = groups.group(1)
182-
result.append(
183-
Submission(
184-
submission_id,
185-
commitfest_id,
186-
name,
187-
state,
188-
authors.split(", "),
189-
latest_email,
190-
)
191-
)
192-
state = None
193-
return result
137+
138+
return [
139+
Submission(
140+
p["id"],
141+
commitfest_id,
142+
p["name"],
143+
p["status"],
144+
p["authors"],
145+
p["last_email_time"],
146+
)
147+
for p in data["patches"]
148+
]
194149

195150

196151
def get_current_commitfests():

0 commit comments

Comments
 (0)