diff --git a/.github/scripts/cache.sh b/.github/scripts/cache.sh deleted file mode 100755 index 371a3cb5b..000000000 --- a/.github/scripts/cache.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/sh -l - -set -eu - -### VARIABLES ### - -# From action env: -# REPO_DIR - -ACT_SCRIPT_PATH=_visualize/scripts - -### SETUP ### - -# Store absolute path -cd $REPO_DIR -REPO_ROOT=$(pwd) - -### RUN CACHE SCRIPT ### - -cd $REPO_ROOT/$ACT_SCRIPT_PATH -./CACHE.sh - -exit 0 diff --git a/.github/scripts/update.sh b/.github/scripts/validate.sh similarity index 64% rename from .github/scripts/update.sh rename to .github/scripts/validate.sh index 7639407a8..dfd69ff98 100755 --- a/.github/scripts/update.sh +++ b/.github/scripts/validate.sh @@ -6,30 +6,15 @@ set -eu # From action env: # REPO_DIR +# TAG -ACT_LOG_PATH=_visualize/LAST_MASTER_UPDATE.txt +ACT_LOG_PATH=_visualize/LAST_${TAG}_UPDATE.txt ACT_INPUT_PATH=_visualize ACT_DATA_PATH=visualize/github-data -ACT_SCRIPT_PATH=_visualize/scripts - -### SETUP ### - -# Store absolute path -cd $REPO_DIR -REPO_ROOT=$(pwd) - -# Store previous END timestamp -OLD_END=$(cat $ACT_LOG_PATH | grep END | cut -f 2) -OLD_END=$(date --date="$OLD_END" "+%s") - -### RUN MASTER SCRIPT ### - -cd $REPO_ROOT/$ACT_SCRIPT_PATH -./MASTER.sh ### VALIDATE UPDATE ### -cd $REPO_ROOT +cd "$REPO_DIR" # Timestamp log changed cat $ACT_LOG_PATH @@ -50,17 +35,6 @@ if [ $(cat $ACT_LOG_PATH | grep -c FAILED) -ne "0" ] || [ $(cat $ACT_LOG_PATH | echo "Timestamp log valid" fi -# New START is later than previous END -NEW_START=$(cat $ACT_LOG_PATH | grep START | cut -f 2) -NEW_START=$(date --date="$NEW_START" "+%s") -if [ "$OLD_END" -gt "$NEW_START" ] - then - echo "UPDATE FAILED - New START is earlier than previous END" - exit 1 - else - echo "START timestamp valid" -fi - # All changes are to valid files only git diff --name-only HEAD CHANGE_COUNT=$(git diff --name-only HEAD | grep -c -E ".+") diff --git a/.github/workflows/cache.yml b/.github/workflows/cache.yml
index 89b01a204..cf848e2e6 100644 --- a/.github/workflows/cache.yml +++ b/.github/workflows/cache.yml @@ -3,12 +3,18 @@ name: Routine Data Cache Request on: workflow_dispatch: schedule: - - cron: "45 8 * * *" + - cron: '45 8 * * *' + +concurrency: + group: data-cache defaults: run: shell: bash +env: + TAG: CACHE + jobs: runDataUpdate: name: Run Cache Request @@ -20,35 +26,44 @@ jobs: - name: Store timestamp run: | echo "TIMESTAMP=$(date -u +"%F-%H")" >> "$GITHUB_ENV" + - name: Checkout uses: actions/checkout@v6 with: path: ${{ env.REPO_DIR }} token: ${{ secrets.GITHUB_TOKEN }} + - name: Setup python uses: actions/setup-python@v6 with: - python-version: "3.11" - cache: "pip" - cache-dependency-path: "${{ env.REPO_DIR }}/_visualize/scripts/requirements.txt" + python-version: '3.11' + cache: 'pip' + cache-dependency-path: '${{ env.REPO_DIR }}/_visualize/scripts/requirements.txt' + - name: Install dependencies - run: pip install -r ${{ env.REPO_DIR }}/_visualize/scripts/requirements.txt - - name: Run cache script - run: ./${{ env.REPO_DIR }}/.github/scripts/cache.sh + run: pip install -r $REPO_DIR/_visualize/scripts/requirements.txt + + - name: Run data collection script + run: | + set -eu + cd $REPO_DIR/_visualize/scripts + ./UPDATE.sh $TAG env: GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Show health stats if: ${{ always() }} run: | - cat ${{ env.REPO_DIR }}/_visualize/LAST_CACHE_REQUEST.txt || true - echo "Warning Count: $(grep -c 'Warning' ${{ env.REPO_DIR }}/_visualize/LAST_CACHE_REQUEST.log)" - echo "From Timeouts: $(grep -c 'but failed' ${{ env.REPO_DIR }}/_visualize/LAST_CACHE_REQUEST.log)" - echo "Limit Reached: $(grep -c 'rate limit exceeded' ${{ env.REPO_DIR }}/_visualize/LAST_CACHE_REQUEST.log)" + cat $REPO_DIR/_visualize/LAST_${TAG}_UPDATE.txt || true + echo "Warning Count: $(grep -c 'Warning' $REPO_DIR/_visualize/LAST_${TAG}_UPDATE.log)" + echo "From Timeouts: $(grep -c 'but failed' $REPO_DIR/_visualize/LAST_${TAG}_UPDATE.log)" + echo "Limit Reached:
$(grep -c 'rate limit exceeded' $REPO_DIR/_visualize/LAST_${TAG}_UPDATE.log)" + - name: Save log files if: ${{ always() }} uses: actions/upload-artifact@v6 with: name: logfiles_${{ env.TIMESTAMP }}_cache path: | - ${{ env.REPO_DIR }}/_visualize/LAST_CACHE_REQUEST.txt - ${{ env.REPO_DIR }}/_visualize/LAST_CACHE_REQUEST.log + ${{ env.REPO_DIR }}/_visualize/LAST_${{ env.TAG }}_UPDATE.txt + ${{ env.REPO_DIR }}/_visualize/LAST_${{ env.TAG }}_UPDATE.log diff --git a/.github/workflows/update.yml b/.github/workflows/update.yml index 1e2574a5c..d6b860e56 100644 --- a/.github/workflows/update.yml +++ b/.github/workflows/update.yml @@ -3,7 +3,11 @@ name: Routine Data Update on: workflow_dispatch: schedule: - - cron: "45 10 * * *" + - cron: '45 10 * * *' + +concurrency: + group: data-updates + cancel-in-progress: true defaults: run: @@ -20,35 +24,72 @@ jobs: - name: Store timestamp run: | echo "TIMESTAMP=$(date -u +"%F-%H")" >> "$GITHUB_ENV" + - name: Checkout uses: actions/checkout@v6 with: path: ${{ env.REPO_DIR }} token: ${{ secrets.GITHUB_TOKEN }} persist-credentials: false + - name: Setup python uses: actions/setup-python@v6 with: - python-version: "3.11" - cache: "pip" - cache-dependency-path: "${{ env.REPO_DIR }}/_visualize/scripts/requirements.txt" + python-version: '3.11' + cache: 'pip' + cache-dependency-path: '${{ env.REPO_DIR }}/_visualize/scripts/requirements.txt' + - name: Install dependencies - run: pip install -r ${{ env.REPO_DIR }}/_visualize/scripts/requirements.txt - - name: Run update script + run: pip install -r $REPO_DIR/_visualize/scripts/requirements.txt + + - name: Create GitHub App Installation Token1 + uses: actions/create-github-app-token@v2 + id: app-token1 + with: + app-id: ${{ vars.APP_ID }} + private-key: ${{ secrets.PRIVATE_KEY }} + + - name: Run data collection script with App Installation Token + run: | + set -eu + cd $REPO_DIR/_visualize/scripts + ./UPDATE.sh $TAG + env: + GITHUB_API_TOKEN: ${{ steps.app-token1.outputs.token }} + TAG: MEMBERS
+ + - name: Validate members data updates + run: ./$REPO_DIR/.github/scripts/validate.sh + env: + TAG: MEMBERS + + - name: Run data collection script with Action Token + run: | + set -eu + cd $REPO_DIR/_visualize/scripts + ./UPDATE.sh $TAG env: GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: ./${{ env.REPO_DIR }}/.github/scripts/update.sh + TAG: CORE + + - name: Validate core data updates + run: ./$REPO_DIR/.github/scripts/validate.sh + env: + TAG: CORE + - name: Create GitHub App Installation Token uses: actions/create-github-app-token@v2 id: app-token with: app-id: ${{ vars.APP_ID }} private-key: ${{ secrets.PRIVATE_KEY }} + - name: Get GitHub App User ID id: get-user-id env: GH_TOKEN: ${{ steps.app-token.outputs.token }} run: echo "user-id=$(gh api "/users/${{ steps.app-token.outputs.app-slug }}[bot]" --jq .id)" >> "$GITHUB_OUTPUT" + - name: Configure git env: GH_TOKEN: ${{ steps.app-token.outputs.token }} @@ -56,30 +97,37 @@ jobs: gh auth setup-git git config --global user.name '${{ steps.app-token.outputs.app-slug }}[bot]' git config --global user.email '${{ steps.get-user-id.outputs.user-id }}+${{ steps.app-token.outputs.app-slug }}[bot]@users.noreply.github.com' + - name: Commit updated data env: GH_TOKEN: ${{ steps.app-token.outputs.token }} run: | - pushd ${{ env.REPO_DIR }} + set -eu + cd $REPO_DIR git stash git pull --ff-only git stash pop git add -A .
- git commit -m "${{ env.TIMESTAMP }} Data Update by ${{ steps.app-token.outputs.app-slug }}" + git commit -m "$TIMESTAMP Data Update by ${{ steps.app-token.outputs.app-slug }}" git push - popd + - name: Show health stats if: ${{ always() }} run: | - cat ${{ env.REPO_DIR }}/_visualize/LAST_MASTER_UPDATE.txt || true - echo "Warning Count: $(grep -c 'Warning' ${{ env.REPO_DIR }}/_visualize/LAST_MASTER_UPDATE.log)" - echo "From Timeouts: $(grep -c 'but failed' ${{ env.REPO_DIR }}/_visualize/LAST_MASTER_UPDATE.log)" - echo "Limit Reached: $(grep -c 'rate limit exceeded' ${{ env.REPO_DIR }}/_visualize/LAST_MASTER_UPDATE.log)" + for TAG in MEMBERS CORE; do + cat $REPO_DIR/_visualize/LAST_${TAG}_UPDATE.txt || true + echo "Warning Count: $(grep -c 'Warning' $REPO_DIR/_visualize/LAST_${TAG}_UPDATE.log)" + echo "From Timeouts: $(grep -c 'but failed' $REPO_DIR/_visualize/LAST_${TAG}_UPDATE.log)" + echo "Limit Reached: $(grep -c 'rate limit exceeded' $REPO_DIR/_visualize/LAST_${TAG}_UPDATE.log)" + done + - name: Save log files if: ${{ always() }} uses: actions/upload-artifact@v6 with: name: logfiles_${{ env.TIMESTAMP }}_update path: | - ${{ env.REPO_DIR }}/_visualize/LAST_MASTER_UPDATE.txt - ${{ env.REPO_DIR }}/_visualize/LAST_MASTER_UPDATE.log + ${{ env.REPO_DIR }}/_visualize/LAST_MEMBERS_UPDATE.txt + ${{ env.REPO_DIR }}/_visualize/LAST_MEMBERS_UPDATE.log + ${{ env.REPO_DIR }}/_visualize/LAST_CORE_UPDATE.txt + ${{ env.REPO_DIR }}/_visualize/LAST_CORE_UPDATE.log diff --git a/.gitignore b/.gitignore index fed18566a..0348c2908 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,6 @@ _site Gemfile.lock *.pyc _visualize/*.log -_visualize/LAST_CACHE_REQUEST.txt .DS_Store .vscode/ .bundle diff --git a/_visualize/LAST_MASTER_UPDATE.txt b/_visualize/LAST_FULL_UPDATE.txt similarity index 100% rename from _visualize/LAST_MASTER_UPDATE.txt rename to _visualize/LAST_FULL_UPDATE.txt diff --git a/_visualize/README.md b/_visualize/README.md index 893125338..68e2495e3 100644 ---
a/_visualize/README.md +++ b/_visualize/README.md @@ -2,7 +2,7 @@ ```bash cd _visualize/scripts/ -./MASTER.sh +./UPDATE.sh ``` _(Additional script functionality detailed in the [`./scripts` section below][jump2 scripts].)_ @@ -29,7 +29,11 @@ New files are created for each type of data structure. For most files, data is overwritten each time the scripts are run. Other scripts may collect cumulative data with a daily timestamp. If one of these scripts is run multiple times in a single day, the entry for that day will be overwritten. -Running [`MASTER.sh`][mastersh] will run all of the necessary scripts in the appropriate order to fetch the latest data. It will also update [`LAST_MASTER_UPDATE.txt`][lastmasterup] to record when this complete data update was last run. +Running [`UPDATE.sh`][updatesh] will run all of the necessary scripts in the appropriate order to fetch the latest data. It will also update `LAST_FULL_UPDATE.txt` to record when this complete data update was last run. + +You can also run the script with an argument, `UPDATE.sh <TAG>`, to select a custom set of scripts defined in `UPDATE_<TAG>.txt`. +(See [`UPDATE_FULL.txt`][updatefull] for the default set of scripts.) +Timestamps will then be recorded in `LAST_<TAG>_UPDATE.txt` instead. The scripts are only for gathering new data. You do not need them to run in order to view the webpage visualizations. @@ -39,8 +43,8 @@ The scripts are only for gathering new data.
You do not need them to run in orde [queries dir]: queries [scripts dir]: scripts [requires]: scripts/requirements.txt -[mastersh]: scripts/MASTER.sh -[lastmasterup]: LAST_MASTER_UPDATE.txt +[updatesh]: scripts/UPDATE.sh +[updatefull]: scripts/UPDATE_FULL.txt [gitgraphql]: https://developer.github.com/v4/ [oauth]: https://github.com/settings/developers [personaltoken]: https://github.com/settings/tokens diff --git a/_visualize/scripts/CACHE.sh b/_visualize/scripts/CACHE.sh deleted file mode 100755 index 54429ccca..000000000 --- a/_visualize/scripts/CACHE.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -# Run this script to trigger GitHub's data caching -# https://docs.github.com/en/rest/metrics/statistics?apiVersion=2022-11-28#best-practices-for-caching - -exec &> ../LAST_CACHE_REQUEST.log - -export GITHUB_DATA=../../visualize/github-data -DATELOG=../LAST_CACHE_REQUEST.txt - -# On exit -function finish { - # Log end time - echo -e "END\t$(date -u)" >> $DATELOG -} -trap finish EXIT - -# Stop and Log for failed scripts -function errorCheck() { - if [ $ret -ne 0 ]; then - echo "FAILED - $1" - echo -e "FAILED\t$1" >> $DATELOG - exit 1 - fi -} - -# Basic script run procedure -function runScript() { - echo "Run - $1" - python -u $1 - ret=$? 
- errorCheck "$1" -} - - -# Check Python requirements -runScript python_check.py - - -echo "RUNNING CACHE REQUEST SCRIPT" - -# Log start time -echo -e "$(date -u '+%F-%H')" > $DATELOG -echo -e "START\t$(date -u)" >> $DATELOG - - -# --- CHACHEABLE QUERIES --- -runScript cache_repos_activitycommits.py - - -echo "CACHE REQUEST COMPLETE" diff --git a/_visualize/scripts/MASTER.sh b/_visualize/scripts/MASTER.sh deleted file mode 100755 index 6827e7884..000000000 --- a/_visualize/scripts/MASTER.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash -# Run this script to refresh all data - -exec &> ../LAST_MASTER_UPDATE.log - -export GITHUB_DATA=../../visualize/github-data -DATELOG=../LAST_MASTER_UPDATE.txt - -# On exit -function finish { - # Log end time - echo -e "END\t$(date -u)" >> $DATELOG -} -trap finish EXIT - -# Stop and Log for failed scripts -function errorCheck() { - if [ $ret -ne 0 ]; then - echo "FAILED - $1" - echo -e "FAILED\t$1" >> $DATELOG - exit 1 - fi -} - -# Basic script run procedure -function runScript() { - echo "Run - $1" - python -u $1 - ret=$? 
- errorCheck "$1" -} - - -# Check Python requirements -runScript python_check.py - - -echo "RUNNING MASTER UPDATE SCRIPT" - -# Log start time -echo -e "$(date -u '+%F-%H')" > $DATELOG -echo -e "START\t$(date -u)" >> $DATELOG - - -# RUN THIS FIRST -runScript cleanup_inputs.py - - -# --- BASIC DATA --- -# Required before any other repo scripts (output used as repo list) -runScript get_repos_info.py -# Required before any other member scripts (output used as member list) -runScript get_internal_members.py - - -# --- EXTERNAL V INTERNAL --- -runScript get_members_extrepos.py -runScript get_repos_users.py - - -# --- ADDITIONAL REPO DETAILS --- -runScript get_repos_languages.py -runScript get_repos_topics.py -runScript get_repos_activitycommits.py -runScript get_repos_dependencies.py -runScript get_dependency_info.py - - -# --- HISTORY FOR ALL TIME --- -runScript get_repos_starhistory.py -runScript get_repos_releases.py -runScript get_repos_creationhistory.py - - -# RUN THIS LAST -runScript build_yearlist.py # Used in case of long term cumulative data - -runScript gather_repo_metadata.py # Generate simplified metadata file - - -echo "MASTER UPDATE COMPLETE" diff --git a/_visualize/scripts/README.md b/_visualize/scripts/README.md index 70a4681e4..8bfe6cf3a 100644 --- a/_visualize/scripts/README.md +++ b/_visualize/scripts/README.md @@ -15,5 +15,5 @@ source venv/bin/activate pip install -r requirements.txt # Run the collection script -./MASTER.sh +./UPDATE.sh ``` diff --git a/_visualize/scripts/UPDATE.sh b/_visualize/scripts/UPDATE.sh new file mode 100755 index 000000000..34a6ab97b --- /dev/null +++ b/_visualize/scripts/UPDATE.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Run this script to refresh data. +# Script selection can be customized through a text file UPDATE_<TAG>.txt +# Providing the argument <TAG> to this script will select UPDATE_<TAG>.txt +# The input list from UPDATE_FULL.txt is selected by default.
+ +# Default to the FULL script list when no argument is given +TAG="${1:-FULL}" +SCRIPT_LIST="UPDATE_${TAG}.txt" + +exec &> "../LAST_${TAG}_UPDATE.log" + +export GITHUB_DATA=../../visualize/github-data +DATELOG="../LAST_${TAG}_UPDATE.txt" + +# On exit +function finish { + # Log end time + echo -e "END\t$(date -u)" >> "$DATELOG" +} +trap finish EXIT + +# Stop and Log for failed scripts +function errorCheck() { + if [ "$ret" -ne 0 ]; then + echo "FAILED - $1" + echo -e "FAILED\t$1" >> "$DATELOG" + exit 1 + fi +} + +# Basic script run procedure +function runScript() { + echo "Run - $1" + python -u "$1" + ret=$? + errorCheck "$1" +} + + +# Check Python requirements +runScript python_check.py + +echo "RUNNING ${TAG} UPDATE SCRIPT" + +# Log start time +echo -e "$(date -u '+%F-%H')" > "$DATELOG" +echo -e "START\t$(date -u)" >> "$DATELOG" + +# Fail (and log it) instead of silently running nothing for an unknown TAG +if [ ! -r "$SCRIPT_LIST" ]; then + echo "FAILED - $SCRIPT_LIST not found" + echo -e "FAILED\t$SCRIPT_LIST not found" >> "$DATELOG" + exit 1 +fi + +# RUN THIS FIRST +runScript cleanup_inputs.py + +# DATA COLLECTION (comment lines and blank lines in the list are skipped) +readarray -t script_array < <(grep -Ev '^[[:space:]]*(#|$)' "$SCRIPT_LIST") +echo "Data scripts queued: (${script_array[*]})" +for datascript in "${script_array[@]}"; do + runScript "${datascript}.py" +done + +echo "${TAG} UPDATE COMPLETE" diff --git a/_visualize/scripts/UPDATE_CACHE.txt b/_visualize/scripts/UPDATE_CACHE.txt new file mode 100644 index 000000000..e01af22a0 --- /dev/null +++ b/_visualize/scripts/UPDATE_CACHE.txt @@ -0,0 +1,2 @@ +# --- CACHEABLE QUERIES --- +cache_repos_activitycommits diff --git a/_visualize/scripts/UPDATE_CORE.txt b/_visualize/scripts/UPDATE_CORE.txt new file mode 100644 index 000000000..d84c73af9 --- /dev/null +++ b/_visualize/scripts/UPDATE_CORE.txt @@ -0,0 +1,19 @@ +# --- BASIC DATA --- +get_repos_info +# Required before any other repo scripts (output used as repo list) +# --- EXTERNAL V INTERNAL --- +get_members_extrepos +get_repos_users +# --- ADDITIONAL REPO DETAILS --- +get_repos_languages +get_repos_topics +get_repos_activitycommits +get_repos_dependencies +get_dependency_info +# --- HISTORY FOR ALL TIME --- +get_repos_starhistory +get_repos_releases +get_repos_creationhistory +# --- ADDITIONAL DATA PROCESSING ---
+gather_repo_metadata +# Generates simplified metadata file diff --git a/_visualize/scripts/UPDATE_FULL.txt b/_visualize/scripts/UPDATE_FULL.txt new file mode 100644 index 000000000..eb35ebb11 --- /dev/null +++ b/_visualize/scripts/UPDATE_FULL.txt @@ -0,0 +1,22 @@ +# DEFAULT +# --- BASIC DATA --- +get_repos_info +# Required before any other repo scripts (output used as repo list) +get_internal_members +# Required before any other member scripts (output used as member list) +# --- EXTERNAL V INTERNAL --- +get_members_extrepos +get_repos_users +# --- ADDITIONAL REPO DETAILS --- +get_repos_languages +get_repos_topics +get_repos_activitycommits +get_repos_dependencies +get_dependency_info +# --- HISTORY FOR ALL TIME --- +get_repos_starhistory +get_repos_releases +get_repos_creationhistory +# --- ADDITIONAL DATA PROCESSING --- +gather_repo_metadata +# Generates simplified metadata file diff --git a/_visualize/scripts/UPDATE_MEMBERS.txt b/_visualize/scripts/UPDATE_MEMBERS.txt new file mode 100644 index 000000000..76db534a2 --- /dev/null +++ b/_visualize/scripts/UPDATE_MEMBERS.txt @@ -0,0 +1,3 @@ +# --- BASIC DATA --- +get_internal_members +# Required before any other member scripts (output used as member list) diff --git a/_visualize/scripts/build_yearlist.py b/_visualize/scripts/build_yearlist.py deleted file mode 100644 index 422c4957a..000000000 --- a/_visualize/scripts/build_yearlist.py +++ /dev/null @@ -1,34 +0,0 @@ -from scraper.github import queryManager as qm -from os import environ as env -import os.path - -ghDataDir = env.get("GITHUB_DATA", "../github-data") -yearDict = {} - -# Gather all file name data -print("Checking GitHub data file names with year stamps...") -if not os.path.exists(ghDataDir): - raise FileNotFoundError("Directory path '%s' does not exist." 
% (ghDataDir)) -for file in os.listdir(ghDataDir): - if file.endswith(".json"): - nameSplit = file.split(".") - # Must have format "somePrefix.0000.json" - if not nameSplit[0] == "YEARS" and nameSplit[1].isdigit(): - prefix = nameSplit[0] - yearX = int(nameSplit[1]) - if prefix not in yearDict: - yearDict[prefix] = [] - yearDict[prefix].append(yearX) - -print("Sorting year data...") -# Remove duplicate years (though shouldn't be possible) and sort list -for prefix in yearDict.keys(): - yearList = yearDict[prefix] - yearList = list(set(yearList)) - yearList.sort() - yearDict[prefix] = yearList - -yearData = qm.DataManager("%s/YEARS.json" % ghDataDir, False) -yearData.fileSave(newline="\n") - -print("Done!\n") diff --git a/visualize/github-data/YEARS.json b/visualize/github-data/YEARS.json deleted file mode 100644 index 9e26dfeeb..000000000 --- a/visualize/github-data/YEARS.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file