|
| 1 | +import os |
| 2 | +import requests |
| 3 | +import json |
| 4 | +import click |
| 5 | + |
# API token used to authenticate against GitHub; read from the
# GRAPH_API_KEY environment variable (a missing variable raises KeyError
# when this module is imported).
token = os.environ["GRAPH_API_KEY"]
# URL that every GraphQL query is POSTed to.
endpoint = "https://api.github.com/graphql"
# HTTP headers attached to every request.
headers = {"Authorization": f"bearer {token}"}
| 9 | + |
| 10 | + |
def load_query_from_file(fname, repo_owner="numpy", repo_name="numpy"):
    """
    Load an 'issue' query from file and set the target repository, where
    the target repository has the format:

    https://github.com/<repo_owner>/<repo_name>

    Parameters
    ----------
    fname : str
        Path to a text file containing a valid issue query according to the
        GitHub GraphQL schema. The file is read as UTF-8.
    repo_owner : str
        Owner of target repository on GitHub. Default is 'numpy'.
    repo_name : str
        Name of target repository on GitHub. Default is 'numpy'.

    Returns
    -------
    query : str
        Query loaded from file in text form suitable for ``send_query``.

    Notes
    -----
    This function expects the query to have a specific form and will not work
    for general GitHub GraphQL queries: the template must contain the
    placeholders ``_REPO_OWNER_`` and ``_REPO_NAME_``, which are replaced
    verbatim. See ``examples/`` for some valid templated issue queries.
    """
    # Decode explicitly as UTF-8 so the loaded text does not depend on the
    # platform's locale encoding.
    with open(fname, "r", encoding="utf-8") as fh:
        query = fh.read()
    # Set target repo from template
    query = query.replace("_REPO_OWNER_", repo_owner)
    query = query.replace("_REPO_NAME_", repo_name)
    return query
| 45 | + |
| 46 | + |
def send_query(query, query_type, cursor=None):
    """
    Send a GraphQL query via requests.post

    No validation is done on the query before sending. GitHub GraphQL
    pagination is supported with the `cursor` argument.

    Parameters
    ----------
    query : str
        The GraphQL query to be sent
    query_type : {"issues", "pullRequests"}
        The object being queried according to the GitHub GraphQL schema.
        Currently only issues and pullRequests are supported
    cursor : str, optional
        If given, then the cursor is injected into the query to support
        GitHub's GraphQL pagination.

    Returns
    -------
    dict
        The result of the query (json) parsed by `json.loads`

    Raises
    ------
    ValueError
        If `query_type` is not supported, or if a `cursor` is given but the
        query does not contain ``<query_type>(`` to inject it into.

    Notes
    -----
    This is intended mostly for internal use within `get_all_responses`.
    """
    # TODO: Expand this, either by parsing the query type from the query
    # directly or manually adding more query_types to the set
    if query_type not in {"issues", "pullRequests"}:
        raise ValueError(
            "Only 'issues' and 'pullRequests' queries are currently supported"
        )
    # TODO: Generalize this
    # WARNING: The cursor injection depends on the specific structure of the
    # query, this is the main reason why query types are limited to issues/PRs
    if cursor is not None:
        cursor_insertion_key = query_type + "("
        key_start = query.find(cursor_insertion_key)
        # str.find returns -1 when the key is absent; splicing at that offset
        # would silently corrupt the query, so fail loudly instead.
        if key_start == -1:
            raise ValueError(
                f"Cannot inject pagination cursor: '{cursor_insertion_key}' "
                "not found in query"
            )
        cursor_ind = key_start + len(cursor_insertion_key)
        query = query[:cursor_ind] + f'after:"{cursor}", ' + query[cursor_ind:]
    # Build request payload. The query is sent unmodified: the JSON encoding
    # performed by requests handles newlines, whereas deleting them could
    # merge tokens on adjacent lines or let a '#' comment swallow the rest
    # of the query.
    payload = {'query': query}
    response = requests.post(endpoint, json=payload, headers=headers)
    return json.loads(response.content)
| 91 | + |
def get_all_responses(query, query_type):
    """
    Helper function to bypass GitHub GraphQL API node limit.

    The query is sent once without a cursor, then re-sent with the cursor of
    the last edge received until the number of collected edges reaches the
    total count reported by the first response.

    Parameters
    ----------
    query : str
        The GraphQL query, passed unchanged to `send_query`.
    query_type : {"issues", "pullRequests"}
        The object being queried, passed to `send_query` and
        `parse_single_query`.

    Returns
    -------
    list
        The edges accumulated over all pages.
    """
    # First page: establishes the total count and the starting cursor
    page = send_query(query, query_type)
    results, cursor, expected = parse_single_query(page, query_type)
    print(f"Retrieving {len(results)} out of {expected} values...")
    # Follow the cursor until every expected edge has been collected
    while not len(results) >= expected:
        page = send_query(query, query_type, cursor=cursor)
        edges, cursor, _ = parse_single_query(page, query_type)
        results += edges
        print(f"Retrieving {len(results)} out of {expected} values...")
    print("Done.")
    return results
| 108 | + |
def parse_single_query(data, query_type):
    """
    Parse the data returned by `send_query`

    .. warning::

        Like `send_query`, the logic here depends on the specific structure
        of the query (e.g. it must be an issue or PR query, and must have a
        total count).

    Parameters
    ----------
    data : dict
        Parsed JSON response; expected to contain
        ``data['data']['repository'][query_type]`` with ``totalCount`` and
        ``edges`` entries, each edge carrying a ``cursor``.
    query_type : str
        Key of the queried object under ``repository``.

    Returns
    -------
    edges : list
        The edges contained in this response.
    last_cursor : str or None
        Cursor of the last edge, or None when the result set is empty
        (no edges and a total count of 0).
    total_count : int
        The ``totalCount`` reported by the response.

    Raises
    ------
    KeyError, TypeError
        If the response does not have the expected structure (e.g. an error
        response where ``data`` or ``repository`` is null). The full
        response is printed before re-raising.
    IndexError
        If the response has no edges although the total count is non-zero.
    """
    try:
        connection = data['data']['repository'][query_type]
        total_count = connection['totalCount']
        edges = connection['edges']
        last_cursor = edges[-1]['cursor'] if edges else None
    except (KeyError, TypeError):
        # Show the whole response (typically an API error message) to make
        # the failure diagnosable, then propagate the original exception.
        print(data)
        raise
    if not edges and total_count:
        # An empty page while results are still expected cannot be paginated
        # from; keep failing with IndexError rather than returning no cursor.
        raise IndexError(
            f"Response contains no edges but totalCount is {total_count}"
        )
    return edges, last_cursor, total_count
| 127 | + |
| 128 | + |
class GithubGrabber:
    """
    Pull down data via the GitHub APIv.4 given a valid GraphQL query.
    """

    def __init__(self, query_fname, query_type, repo_owner="numpy", repo_name="numpy"):
        """
        Create an object to send/recv queries related to the issue tracker
        for the given repository via the GitHub API v.4.

        The repository to query against is given by:
        https://github.com/<repo_owner>/<repo_name>

        Parameters
        ----------
        query_fname : str
            Path to a valid GraphQL query conforming to the GitHub GraphQL
            schema
        query_type : {"issues", "pullRequests"}
            Type of object that is being queried according to the GitHub GraphQL
            schema. Currently only "issues" and "pullRequests" are supported.
        repo_owner : str
            Repository owner. Default is "numpy"
        repo_name : str
            Repository name. Default is "numpy"
        """
        self.query_fname = query_fname
        self.query_type = query_type  # TODO: Parse this directly from query
        self.repo_owner = repo_owner
        self.repo_name = repo_name
        # None until `get` has been called; afterwards the list of results.
        self.raw_data = None
        self.load_query()

    def load_query(self):
        """
        (Re)load the query template from `query_fname` and store the query
        for the configured repository in `self.query`.
        """
        self.query = load_query_from_file(
            self.query_fname, self.repo_owner, self.repo_name
        )

    def get(self):
        """
        Get JSON-formatted raw data from the query.

        The result is stored in `self.raw_data`.
        """
        self.raw_data = get_all_responses(self.query, self.query_type)

    def dump(self, outfile):
        """
        Dump raw json to `outfile`.

        Parameters
        ----------
        outfile : str
            Path of the file to write; an existing file is overwritten.

        Raises
        ------
        ValueError
            If no data has been retrieved yet (`raw_data` is None).
        """
        # Only reject the "never fetched" state: a query that legitimately
        # matched nothing yields an empty list, which is still valid output.
        if self.raw_data is None:
            raise ValueError("raw_data is currently empty, nothing to dump")

        with open(outfile, "w") as outf:
            json.dump(self.raw_data, outf)
| 183 | + |
@click.command()
@click.argument('repo_owner')
@click.argument('repo_name')
def main(repo_owner, repo_name):
    """Download and save issue and pr data for `repo_owner`/`repo_name`."""
    # (query template, query type, output file suffix), processed in order:
    # issue data first, then PR data.
    downloads = (
        ('query_examples/issue_activity_since_date.gql', 'issues', '_issues.json'),
        ('query_examples/pr_data_query.gql', 'pullRequests', '_prs.json'),
    )
    for query_file, query_type, suffix in downloads:
        grabber = GithubGrabber(
            query_file,
            query_type,
            repo_owner=repo_owner,
            repo_name=repo_name,
        )
        grabber.get()
        # Output is written to the current directory as <repo_name><suffix>
        grabber.dump(f"{repo_name}{suffix}")


if __name__ == "__main__":
    main()
0 commit comments