Merge pull request #1 from jarrodmillman/basic-package

jarrodmillman · web-flow · commit e2f4bb7cf368 · 2023-05-22T16:11:36.000-07:00
Add basic package for query.py script from devstats-data
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -0,0 +1,33 @@
+name: lint
+
+on: [push, pull_request]
+
+jobs:
+  format:
+    runs-on: ${{ matrix.os }}-latest
+    strategy:
+      matrix:
+        os: [ubuntu]
+        python-version: ["3.10"]
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - uses: actions/cache@v3
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip-${{ hashFiles('**/pyproject.toml') }}
+          restore-keys: |
+            ${{ runner.os }}-pip-
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[lint]
+
+      - name: Lint
+        run: pre-commit run --all-files --show-diff-on-failure --color always
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,42 @@
+# Install pre-commit hooks via
+# pre-commit install
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      - id: check-added-large-files
+      - id: check-ast
+      - id: check-builtin-literals
+      - id: check-case-conflict
+      - id: check-json
+      - id: check-toml
+      - id: check-yaml
+        args: [--allow-multiple-documents]
+      - id: debug-statements
+      - id: end-of-file-fixer
+      - id: mixed-line-ending
+      - id: trailing-whitespace
+
+  - repo: https://github.com/psf/black
+    rev: 23.3.0
+    hooks:
+      - id: black
+
+  - repo: https://github.com/adamchainz/blacken-docs
+    rev: 1.13.0
+    hooks:
+      - id: blacken-docs
+
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: v2.7.1
+    hooks:
+      - id: prettier
+        files: \.(css|html|md|yml|yaml)
+        args: [--prose-wrap=preserve]
+
+  - repo: https://github.com/asottile/pyupgrade
+    rev: v3.4.0
+    hooks:
+      - id: pyupgrade
+        args: [--py310-plus]
diff --git a/RELEASE.md b/RELEASE.md
@@ -0,0 +1,58 @@
+# Release process for `devstats`
+
+## Introduction
+
+Example `version`
+
+- 1.8.dev0 # development version of 1.8 (release candidate 1)
+- 1.8rc1 # 1.8 release candidate 1
+- 1.8rc2.dev0 # development version of 1.8 release candidate 2
+- 1.8 # 1.8 release
+- 1.9.dev0 # development version of 1.9 (release candidate 1)
+
+## Process
+
+- Update and review `CHANGELOG.md`:
+
+      gem install github_changelog_generator
+      github_changelog_generator -u scientific-python -p devstats --since-tag=<last tag>
+
+- Update `version` in `pyproject.toml`.
+
+- Commit changes:
+
+      git add pyproject.toml CHANGELOG.md
+      git commit -m 'Designate <version> release'
+
+- Add the version number (e.g., `1.2.0`) as a tag in git:
+
+      git tag -s [-u <key-id>] v<version> -m 'signed <version> tag'
+
+  If you do not have a gpg key, use `-a` instead of `-s`; it is important for
+  Debian packaging that the tags are annotated.
+
+- Push the new meta-data to github:
+
+      git push --tags origin main
+
+  where `origin` is the name of the `github.com:scientific-python/devstats`
+  repository.
+
+- Review the github release page:
+
+      https://github.com/scientific-python/devstats/releases
+
+- Publish on PyPI:
+
+      git clean -fxd
+      pip install -U build twine wheel
+      python -m build --sdist --wheel
+      twine upload -s dist/*
+
+- Update `version` in `pyproject.toml` to the next development version (e.g., `1.9.dev0`).
+
+- Commit changes:
+
+      git add pyproject.toml
+      git commit -m 'Bump version'
+      git push origin main
diff --git a/devstats/__init__.py b/devstats/__init__.py
@@ -0,0 +1,211 @@
+import os
+import requests
+import json
+import click
+
+token = os.environ["GRAPH_API_KEY"]  # GitHub API token; KeyError at import if unset
+endpoint = "https://api.github.com/graphql"  # GitHub GraphQL API endpoint
+headers = {"Authorization": f"bearer {token}"}  # sent with every request
+
+
+def load_query_from_file(fname, repo_owner="numpy", repo_name="numpy"):
+    """
+    Load a templated 'issue' query from file and set the target repository.
+
+    The target repository has the format:
+
+    https://github.com/<repo_owner>/<repo_name>
+
+    Parameters
+    ----------
+    fname : str
+        Path to a text file containing a valid issue query according to the
+        GitHub GraphQL schema, in which ``_REPO_OWNER_`` and ``_REPO_NAME_``
+        mark where the repository owner and name are substituted.
+    repo_owner : str
+        Owner of target repository on GitHub. Default is 'numpy'.
+    repo_name : str
+        Name of target repository on GitHub. Default is 'numpy'.
+
+    Returns
+    -------
+    query : str
+        Query loaded from file in text form suitable for ``send_query``.
+
+    Notes
+    -----
+    This function expects the query to have a specific form and will not work
+    for general GitHub GraphQL queries. See ``examples/`` for some valid
+    templated issue queries.
+    """
+    # Decode as UTF-8 explicitly so the result does not depend on the locale.
+    with open(fname, encoding="utf-8") as fh:
+        text = fh.read()
+    return text.replace("_REPO_OWNER_", repo_owner).replace("_REPO_NAME_", repo_name)
+
+
+def send_query(query, query_type, cursor=None):
+    """
+    Send a GraphQL query via requests.post
+
+    No validation is done on the query before sending. GitHub GraphQL
+    pagination is supported with the `cursor` argument. This is intended
+    mostly for internal use within `get_all_responses`.
+
+    Parameters
+    ----------
+    query : str
+        The GraphQL query to be sent
+    query_type : {"issues", "pullRequests"}
+        The object being queried according to the GitHub GraphQL schema.
+        Currently only issues and pullRequests are supported
+    cursor : str, optional
+        If given, then the cursor is injected into the query to support
+        GitHub's GraphQL pagination.
+
+    Returns
+    -------
+    dict
+        The result of the query (json) parsed by `json.loads`
+
+    Raises
+    ------
+    ValueError
+        If `query_type` is unsupported or the cursor cannot be injected.
+    """
+    # TODO: Parse the query type from the query, or add more query_types here
+    if query_type not in {"issues", "pullRequests"}:
+        raise ValueError(
+            "Only 'issues' and 'pullRequests' queries are currently supported"
+        )
+    # WARNING: The cursor injection depends on the specific structure of the
+    # query, this is the main reason why query types are limited to issues/PRs
+    if cursor is not None:
+        head, key, tail = query.partition(query_type + "(")
+        if not key:  # insertion point missing; do not send a garbled query
+            raise ValueError(f"Cannot inject cursor: no '{query_type}(' in query")
+        query = f'{head}{key}after:"{cursor}", {tail}'
+    payload = {"query": " ".join(query.splitlines())}  # space keeps tokens apart
+    response = requests.post(endpoint, json=payload, headers=headers)
+    return json.loads(response.content)
+
+def get_all_responses(query, query_type):
+    """
+    Collect every edge of a paginated query, working around GitHub's node limit.
+
+    Requests pages until the first response's ``totalCount`` is reached.
+    """
+    edges, cursor, total = parse_single_query(send_query(query, query_type), query_type)
+    while True:
+        print(f"Retrieving {len(edges)} out of {total} values...")
+        if len(edges) >= total:
+            break
+        page = send_query(query, query_type, cursor=cursor)  # resume after last edge
+        more, cursor, _ = parse_single_query(page, query_type)
+        edges.extend(more)
+    print("Done.")
+    return edges
+
+def parse_single_query(data, query_type):
+    """
+    Parse the data returned by `send_query`
+
+    Returns ``(edges, last_cursor, total_count)``. `last_cursor` is None for an
+    empty result (``totalCount`` 0); an empty page of a non-empty result still
+    raises IndexError, as a None cursor would restart pagination. Malformed
+    responses (e.g. a null ``repository``) are printed before re-raising.
+    Only issue/PR queries with ``totalCount`` and edge cursors are supported.
+    """
+    try:
+        connection = data["data"]["repository"][query_type]
+        total_count, edges = connection["totalCount"], connection["edges"]
+        last_cursor = edges[-1]["cursor"] if edges or total_count else None
+    except (KeyError, TypeError):  # TypeError: a level of the response is null
+        print(data)
+        raise
+    return edges, last_cursor, total_count
+
+
+class GithubGrabber:
+    """
+    Fetch issue or pull request data for one repository through the GitHub
+    GraphQL API (v4), using a query template stored on disk.
+
+    Attributes
+    ----------
+    query : str
+        The template from `query_fname` with the repository filled in.
+    raw_data
+        Result of `get_all_responses`; None until `get` has been called.
+    """
+
+    def __init__(self, query_fname, query_type, repo_owner="numpy", repo_name="numpy"):
+        """
+        Store the settings and load the query for
+        https://github.com/<repo_owner>/<repo_name>
+
+        Parameters
+        ----------
+        query_fname : str
+            Path to a valid GraphQL query conforming to the GitHub GraphQL
+            schema
+        query_type : {"issues", "pullRequests"}
+            Type of object that is being queried according to the GitHub GraphQL
+            schema. Currently only "issues" and "pullRequests" are supported.
+        repo_owner : str
+            Repository owner. Default is "numpy"
+        repo_name : str
+            Repository name. Default is "numpy"
+        """
+        self.raw_data = None
+        # TODO: Parse query_type directly from query
+        self.query_fname, self.query_type = query_fname, query_type
+        self.repo_owner, self.repo_name = repo_owner, repo_name
+        self.load_query()
+
+    def load_query(self):
+        """(Re)load `self.query` from `self.query_fname` for the current repo."""
+        fname, owner, name = self.query_fname, self.repo_owner, self.repo_name
+        self.query = load_query_from_file(fname, owner, name)
+
+    def get(self):
+        """Run the query, storing all paginated results in `self.raw_data`."""
+        self.raw_data = get_all_responses(self.query, self.query_type)
+
+    def dump(self, outfile):
+        """Write `self.raw_data` to `outfile` as JSON; ValueError if it is empty."""
+        if not self.raw_data:
+            raise ValueError("raw_data is currently empty, nothing to dump")
+
+        with open(outfile, "w") as fh:
+            json.dump(self.raw_data, fh)
+
+
+@click.command()
+@click.argument("repo_owner")
+@click.argument("repo_name")
+def main(repo_owner, repo_name):
+    """Download and save issue and pr data for `repo_owner`/`repo_name`."""
+    # (query template, GraphQL object type, output file label), in run order.
+    # NOTE: the template paths are relative to the current working directory.
+    downloads = (
+        ("query_examples/issue_activity_since_date.gql", "issues", "issues"),
+        ("query_examples/pr_data_query.gql", "pullRequests", "prs"),
+    )
+
+    for query_fname, query_type, label in downloads:
+        grabber = GithubGrabber(
+            query_fname,
+            query_type,
+            repo_owner=repo_owner,
+            repo_name=repo_name,
+        )
+        # Fetch every page, then write <repo_name>_<label>.json to the
+        # current working directory.
+        grabber.get()
+        grabber.dump(f"{repo_name}_{label}.json")
+
+
+
+if __name__ == "__main__":  # run the CLI when this file is executed as a script
+    main()
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,33 @@
+[project]
+name = "devstats"
+version = "0.1rc0.dev0"
+requires-python = ">=3.10"
+description = "Developer tool for scientific Python libraries"
+license = {file = "LICENSE"}
+maintainers = [
+  {name = "Scientific Python", email = "devstats@discuss.scientific-python.org"}
+]
+classifiers = [
+  "Development Status :: 3 - Alpha",
+  "Programming Language :: Python"
+]
+dependencies = [
+  "click",
+  "requests",
+  "jupyter",
+  "notebook",
+  "numpy",
+  "networkx",
+]
+
+[project.scripts]
+query = "devstats:main"
+
+[project.optional-dependencies]
+lint = ["pre-commit >= 3.3"]
+
+[project.urls]
+homepage = "https://github.com/scientific-python/devstats"
+
+[tool.setuptools.packages.find]
+include = ["devstats*"]