|
1 | | -import os |
2 | | -import requests |
3 | | -import sys |
4 | | -import json |
5 | | -import click |
6 | | -from glob import glob |
7 | | -import re |
8 | | - |
# API token for the GitHub GraphQL endpoint, read from the environment once
# when this module is loaded.
try:
    token = os.environ["GRAPH_API_KEY"]
except KeyError:
    # Report on stderr and exit with a non-zero status so that shells and CI
    # jobs can tell the script did not run (a bare ``sys.exit()`` exits with
    # status 0, i.e. "success").
    print("You need to set GRAPH_API_KEY", file=sys.stderr)
    print("But you shouldn't use this yet.", file=sys.stderr)
    sys.exit(1)

# URL that queries are posted to and the authorization header that
# accompanies each request.
endpoint = r"https://api.github.com/graphql"
headers = {"Authorization": f"bearer {token}"}
18 | | - |
19 | | - |
def load_query_from_file(fname, repo_owner="numpy", repo_name="numpy"):
    """
    Load an 'issue' query from file and set the target repository, where
    the target repository has the format:

    https://github.com/<repo_owner>/<repo_name>

    Parameters
    ----------
    fname : str
        Path to a text file containing a valid issue query according to the
        GitHub GraphQL schema. The file is read as UTF-8.
    repo_owner : str
        Owner of target repository on GitHub. Default is 'numpy'.
    repo_name : str
        Name of target repository on GitHub. Default is 'numpy'.

    Returns
    -------
    query : str
        Query loaded from file in text form suitable for ``send_query``.

    Notes
    -----
    This function expects the query to have a specific form and will not work
    for general GitHub GraphQL queries. See ``examples/`` for some valid
    templated issue queries.

    The template placeholders ``_REPO_OWNER_`` and ``_REPO_NAME_`` are
    replaced wherever they occur in the file.
    """
    # Be explicit about the encoding: the platform default is locale
    # dependent and would mis-decode non-ASCII query text on some systems.
    with open(fname, encoding="utf-8") as fh:
        query = fh.read()
    # Set target repo from template
    query = query.replace("_REPO_OWNER_", repo_owner)
    query = query.replace("_REPO_NAME_", repo_name)
    return query
54 | | - |
55 | | - |
def send_query(query, query_type, cursor=None):
    """
    Send a GraphQL query via requests.post

    No validation is done on the query before sending. GitHub GraphQL
    pagination is supported with the `cursor` argument.

    Parameters
    ----------
    query : str
        The GraphQL query to be sent
    query_type : {"issues", "pullRequests"}
        The object being queried according to the GitHub GraphQL schema.
        Currently only issues and pullRequests are supported
    cursor : str, optional
        If given, then the cursor is injected into the query to support
        GitHub's GraphQL pagination.

    Returns
    -------
    dict
        The result of the query (json) parsed by `json.loads`

    Raises
    ------
    ValueError
        If `query_type` is not supported, or if a `cursor` is given but the
        query does not contain ``<query_type>(`` to attach it to.

    Notes
    -----
    This is intended mostly for internal use within `get_all_responses`.

    The HTTP status of the response is not checked; whatever body the server
    returns is parsed and handed back to the caller.
    """
    # TODO: Expand this, either by parsing the query type from the query
    # directly or manually adding more query_types to the set
    if query_type not in {"issues", "pullRequests"}:
        raise ValueError(
            "Only 'issues' and 'pullRequests' queries are currently supported"
        )
    # TODO: Generalize this
    # WARNING: The cursor injection depends on the specific structure of the
    # query, this is the main reason why query types are limited to issues/PRs
    if cursor is not None:
        cursor_insertion_key = query_type + "("
        key_ind = query.find(cursor_insertion_key)
        if key_ind == -1:
            # str.find signals "not found" with -1; splicing at that offset
            # would silently corrupt the query instead of paginating it.
            raise ValueError(
                f"Cannot inject pagination cursor: '{cursor_insertion_key}' "
                "not found in query"
            )
        cursor_ind = key_ind + len(cursor_insertion_key)
        query = query[:cursor_ind] + f'after:"{cursor}", ' + query[cursor_ind:]
    # Build request payload. Newlines are replaced by a space (rather than
    # removed) so that tokens on adjacent lines can never be fused together.
    payload = {"query": query.replace("\n", " ")}
    response = requests.post(endpoint, json=payload, headers=headers)
    return json.loads(response.content)
100 | | - |
101 | | - |
def get_all_responses(query, query_type):
    """
    Collect every result of a paginated query.

    The first page is requested without a cursor; further pages are
    requested with the cursor of the last edge received so far, until the
    number of collected edges reaches the ``totalCount`` reported by the
    first response. Progress is printed to stdout along the way.

    Parameters
    ----------
    query : str
        The GraphQL query, passed through to ``send_query``.
    query_type : {"issues", "pullRequests"}
        The object being queried, passed through to ``send_query`` and
        ``parse_single_query``.

    Returns
    -------
    list
        The edges of all pages, in the order they were received.
    """
    print("Retrieving first page...", end="", flush=True)
    collected, cursor, expected = parse_single_query(
        send_query(query, query_type), query_type
    )

    # Keep paging from the most recent cursor until everything is collected
    while len(collected) < expected:
        page = send_query(query, query_type, cursor=cursor)
        edges, cursor, _ = parse_single_query(page, query_type)
        collected.extend(edges)
        progress = f"OK\nRetrieving {len(collected)} out of {expected} values..."
        print(progress, end="", flush=True)

    print("OK")
    return collected
123 | | - |
124 | | - |
def parse_single_query(data, query_type):
    """
    Parse the data returned by `send_query`

    .. warning::

       Like `send_query`, the logic here depends on the specific structure
       of the query (e.g. it must be an issue or PR query, and must have a
       total count).

    Parameters
    ----------
    data : dict
        A parsed response; the values are read from
        ``data["data"]["repository"][query_type]``.
    query_type : str
        Key of the queried object inside the ``repository`` entry.

    Returns
    -------
    edges : list
        The ``edges`` of the response.
    last_cursor : str or None
        The ``cursor`` of the last edge, or None when the response reports a
        total count of zero and therefore contains no edges.
    total_count : int
        The ``totalCount`` reported by the response.

    Raises
    ------
    KeyError, TypeError, IndexError
        If the response does not have the expected structure (for example an
        error payload without results). The full response is printed before
        the exception propagates.
    """
    try:
        connection = data["data"]["repository"][query_type]
        total_count = connection["totalCount"]
        edges = connection["edges"]
        if not edges and total_count == 0:
            # Nothing to report (e.g. a repository without any issues):
            # there is no edge to take a cursor from.
            last_cursor = None
        else:
            # An empty page despite a non-zero total is unexpected and still
            # fails here, exactly as before.
            last_cursor = edges[-1]["cursor"]
    except (KeyError, TypeError, IndexError):
        # Show the complete response to make API errors diagnosable, then
        # let the original exception propagate unchanged.
        print(data)
        raise
    return edges, last_cursor, total_count
143 | | - |
144 | | - |
class GithubGrabber:
    """
    Fetch data through the GitHub API v.4 using a GraphQL query stored in a
    file.
    """

    def __init__(
        self,
        query_fname,
        query_type,
        repo_owner="numpy",
        repo_name="numpy",
    ):
        """
        Set up a grabber for the issue tracker of one repository and load
        its query from disk.

        The repository that is queried is:
        https://github.com/<repo_owner>/<repo_name>

        Parameters
        ----------
        query_fname : str
            Path to a file holding a GraphQL query that conforms to the
            GitHub GraphQL schema
        query_type : {"issues", "pullRequests"}
            Kind of object the query asks for, in terms of the GitHub
            GraphQL schema. Only "issues" and "pullRequests" are supported
            at the moment.
        repo_owner : str
            Owner of the repository. Default is "numpy"
        repo_name : str
            Name of the repository. Default is "numpy"
        """
        # No data has been fetched until ``get`` is called
        self.raw_data = None
        self.query_fname = query_fname
        # TODO: Parse this directly from query
        self.query_type = query_type
        self.repo_owner = repo_owner
        self.repo_name = repo_name
        self.load_query()

    def load_query(self):
        """
        Read the query file and store the query text, with the target
        repository filled in, as ``self.query``.
        """
        query_text = load_query_from_file(
            self.query_fname, self.repo_owner, self.repo_name
        )
        self.query = query_text

    def get(self):
        """
        Run the query and keep the JSON-formatted result in ``raw_data``.
        """
        responses = get_all_responses(self.query, self.query_type)
        self.raw_data = responses

    def dump(self, outfile):
        """
        Write ``raw_data`` as json to `outfile`.

        Raises
        ------
        ValueError
            If ``raw_data`` is empty (e.g. ``get`` has not been called).
        """
        if self.raw_data:
            with open(outfile, "w") as outf:
                print(f"Writing [{outfile}]")
                json.dump(self.raw_data, outf)
            return

        raise ValueError("raw_data is currently empty, nothing to dump")
200 | | - |
@click.command()
@click.argument("repo_owner")
@click.argument("repo_name")
def main(repo_owner, repo_name):
    """Download and save issue and pr data for `repo_owner`/`repo_name`."""
    # NOTE: the docstring above doubles as the command's ``--help`` text.

    # Every ``*.gql`` file in the ``queries`` directory next to this script
    # is run against the requested repository.
    query_files = glob(os.path.join(os.path.dirname(__file__), "queries/*.gql"))

    # Tag used in the output file name for each supported query type.
    ftype = {"issues": "issues", "pullRequests": "PRs"}

    for n, query in enumerate(query_files):
        # Blank line between the console output of consecutive queries
        if n != 0:
            print()

        print(f"Query: [{os.path.basename(query)}] on [{repo_owner}/{repo_name}]")
        # Parse query type from gql. The file is closed again right away
        # instead of leaving the handle to the garbage collector.
        with open(query, encoding="utf-8") as fh:
            gql = fh.read()
        qtype_match = re.match(
            r"query\s*{\s*repository\(.*?\)\s*{\s*(pullRequests|issues)",
            gql,
            flags=re.MULTILINE,
        )
        if qtype_match is None:
            print(f"Could not determine gql query type for {query}")
            sys.exit(-1)
        qtype = qtype_match.group(1)

        data = GithubGrabber(
            query,
            qtype,
            repo_owner=repo_owner,
            repo_name=repo_name,
        )
        data.get()
        # Written to the current working directory, e.g. ``numpy_PRs.json``
        data.dump(f"{repo_name}_{ftype.get(qtype, qtype)}.json")


if __name__ == "__main__":
    main()
0 commit comments