# Scrape TGA ARTG #69
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: Scrape TGA ARTG

# Scrapes the Australian Register of Therapeutic Goods (ARTG) from
# https://www.tga.gov.au/resources/artg using undetected-chromedriver + Xvfb
# to bypass Akamai CDN detection.
#
# Two operations are supported and can be run independently:
#
#   tga-dates    – Visits each individual ARTG product page to fill in the
#                  RegistrationDate column for rows that don't have one yet.
#                  Runs only on manual dispatch with run_dates enabled; the
#                  scheduled trigger does not start it. Most runs finish in a
#                  few hours once the initial backfill is done.
#
#   tga-listings – Scrapes the paginated ARTG listing to add new product
#                  entries. Runs on the daily schedule and on manual dispatch
#                  with run_listings enabled. The start page is determined
#                  automatically from the number of rows already in
#                  tga_artg.csv, so each run continues seamlessly from where
#                  the last one stopped. Use listings_max_pages to limit how
#                  many pages each run covers (each page ≈ 25 records,
#                  ~10 s/page).
#
# Both operations are resumable: rows already collected are preserved and
# the script skips IDs / dates that were already fetched.

on:
  workflow_dispatch:
    inputs:
      run_dates:
        description: 'Fetch RegistrationDate for rows that are missing one'
        type: boolean
        default: true
      dates_delay:
        description: 'Seconds between product-page requests (dates job)'
        type: string
        default: '2.0'
      run_listings:
        description: 'Scrape listing pages to add new ARTG entries'
        type: boolean
        default: true
      listings_max_pages:
        description: 'Max listing pages to scrape per run; leave blank for all (~3913)'
        type: string
        default: ''
  schedule:
    # Run every day at 00:00 UTC
    - cron: '0 0 * * *'
jobs:
  # ---------------------------------------------------------------------------
  # Job 1: scrape listing pages to add new ARTG entries
  # ---------------------------------------------------------------------------
  tga-listings:
    name: Scrape TGA ARTG listing pages
    # Runs on every scheduled trigger, and on manual dispatch only when the
    # run_listings input is enabled.
    if: >
      github.event_name == 'schedule' ||
      (github.event_name == 'workflow_dispatch' && inputs.run_listings == true)
    runs-on: ubuntu-latest
    # The commit steps below push to the repository, which needs write access
    # to repository contents.
    permissions:
      contents: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: ${{ github.ref }}

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'
          cache: 'pip'

      - name: Install Python dependencies
        run: pip install -r requirements.txt

      - name: Install Xvfb
        # Refresh the package index first so the install does not fail on a
        # stale mirror listing.
        run: |
          sudo apt-get update
          sudo apt-get install -y xvfb

      - name: Scrape TGA listing pages
        # Use a step-level timeout shorter than the 6h job limit so that when
        # scraping runs long the step times out gracefully and the commit step
        # below still gets to execute. continue-on-error ensures the job is
        # not marked as failed due to the timeout, so subsequent steps run
        # under their normal conditions.
        timeout-minutes: 350
        continue-on-error: true
        env:
          PYTHONUNBUFFERED: '1'
          # Passed through the environment instead of being expanded into the
          # script text, so the input value is never parsed as shell code.
          LISTINGS_MAX_PAGES: ${{ inputs.listings_max_pages }}
        run: |
          EXTRA_ARGS=()
          if [ -n "$LISTINGS_MAX_PAGES" ]; then
            EXTRA_ARGS+=(--tga-max-pages "$LISTINGS_MAX_PAGES")
          fi
          # --tga-start-page is intentionally omitted: the script auto-detects
          # the correct page from the existing row count in tga_artg.csv.
          python download_databases.py --tga "${EXTRA_ARGS[@]}"

      - name: Commit updated CSV
        if: always()
        run: |
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git config user.name "github-actions[bot]"
          git add data/tga/tga_artg.csv
          if git diff --staged --quiet; then
            echo "No changes to commit"
            exit 0
          fi
          ROWS=$(python3 -c "import csv; print(sum(1 for _ in csv.DictReader(open('data/tga/tga_artg.csv'))))")
          git commit -m "Update TGA ARTG listings: ${ROWS} total rows [skip ci]"
          # Fold any remaining working-tree changes (e.g., .tmp files, logs)
          # into the commit so the tree is clean before the rebase.
          git add -A
          git diff --staged --quiet || git commit --amend --no-edit
          git pull --rebase
          git push

      - name: Update README with current statistics
        if: always()
        run: |
          python3 update_readme_tga_stats.py
          git add README.md
          if git diff --staged --quiet; then
            echo "No README changes"
            exit 0
          fi
          git commit -m "Update README with TGA statistics [skip ci]"
          git pull --rebase
          git push

  # ---------------------------------------------------------------------------
  # Job 2: fetch RegistrationDate for every row missing one
  # ---------------------------------------------------------------------------
  tga-dates:
    name: Fetch TGA registration dates
    # Runs only when inputs.run_dates is true, i.e. on a manual dispatch with
    # that option enabled. Scheduled runs carry no inputs, so this job is
    # skipped on the cron trigger.
    #
    # NOTE(review): the dependency below is disabled, so a manual dispatch with
    # both run_listings and run_dates enabled starts the two jobs in parallel
    # and both commit data/tga/tga_artg.csv. Re-enable it (and extend the `if`
    # with always() plus the desired needs.tga-listings.result checks) if the
    # jobs are meant to run one after the other.
    # needs: tga-listings
    if: inputs.run_dates == true
    runs-on: ubuntu-latest
    # The commit steps below push to the repository, which needs write access
    # to repository contents.
    permissions:
      contents: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # Check out the branch that triggered the workflow so the commit
          # goes back to the right place (PR branch or main).
          ref: ${{ github.ref }}

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'
          cache: 'pip'

      - name: Install Python dependencies
        run: pip install -r requirements.txt

      - name: Fetch TGA dates
        # Same pattern as the listings scrape: stop before the job limit and
        # let the following steps run even if this one fails or times out.
        timeout-minutes: 350
        continue-on-error: true
        env:
          PYTHONUNBUFFERED: '1'
          # Passed through the environment instead of being expanded into the
          # script text, so the input value is never parsed as shell code.
          DATES_DELAY: ${{ inputs.dates_delay || '2.0' }}
        run: |
          python download_databases.py \
            --tga-dates \
            --tga-dates-delay "$DATES_DELAY"

      - name: Commit updated CSV
        run: |
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git config user.name "github-actions[bot]"
          git add data/tga/tga_artg.csv
          if git diff --staged --quiet; then
            echo "No changes to commit"
            exit 0
          fi
          # Count how many rows now have a date; rows where the column is
          # absent (None) count as undated.
          WITH=$(python3 -c "import csv; print(sum(1 for r in csv.DictReader(open('data/tga/tga_artg.csv')) if (r.get('RegistrationDate') or '').strip()))")
          TOTAL=$(python3 -c "import csv; print(sum(1 for _ in csv.DictReader(open('data/tga/tga_artg.csv'))))")
          git commit -m "Update TGA registration dates: ${WITH}/${TOTAL} rows have date [skip ci]"
          # Fold any remaining working-tree changes (e.g., .tmp files, logs)
          # into the commit so the tree is clean before the rebase.
          git add -A
          git diff --staged --quiet || git commit --amend --no-edit
          git pull --rebase
          git push

      - name: Update README with current statistics
        run: |
          python3 update_readme_tga_stats.py
          git add README.md
          if git diff --staged --quiet; then
            echo "No README changes"
            exit 0
          fi
          git commit -m "Update README with TGA statistics [skip ci]"
          git pull --rebase
          git push