Skip to content

Scrape TGA ARTG

Scrape TGA ARTG #69

Workflow file for this run

name: Scrape TGA ARTG
# Scrapes the Australian Register of Therapeutic Goods (ARTG) from
# https://www.tga.gov.au/resources/artg using undetected-chromedriver + Xvfb
# to bypass Akamai CDN detection.
#
# Two operations are supported and can be run independently:
#
# tga-dates – Visits each individual ARTG product page to fill in the
# RegistrationDate column for rows that don't have one yet.
# When it runs is decided by the tga-dates job's `if:` condition
# below. Most runs finish in a few hours once the initial
# backfill is done.
#
# tga-listings – Scrapes the paginated ARTG listing to add new product
# entries. The start page is determined automatically from
# the number of rows already in tga_artg.csv, so each run
# continues seamlessly from where the last one stopped.
# Use listings_max_pages to limit how many pages each run
# covers (each page ≈ 25 records, ~10 s/page).
#
# Both operations are resumable: rows already collected are preserved and
# the script skips IDs / dates that were already fetched.
on:
  # Timed trigger. Scheduled runs carry no `inputs`, so any job condition
  # that reads `inputs.*` sees empty values for this event.
  schedule:
    # Run every day at 00:00 UTC
    - cron: '0 0 * * *'

  # Manual trigger from the Actions tab; each input maps onto one of the
  # two operations described in the header of this file.
  workflow_dispatch:
    inputs:
      run_dates:
        description: 'Fetch RegistrationDate for rows that are missing one'
        type: boolean
        default: true
      dates_delay:
        description: 'Seconds between product-page requests (dates job)'
        type: string
        required: false
        default: '2.0'
      run_listings:
        description: 'Scrape listing pages to add new ARTG entries'
        type: boolean
        default: true
      listings_max_pages:
        description: 'Max listing pages to scrape per run; leave blank for all (~3913)'
        type: string
        required: false
        default: ''
jobs:
# ---------------------------------------------------------------------------
# Job 1: scrape listing pages to add new ARTG entries
# ---------------------------------------------------------------------------
tga-listings:
  name: Scrape TGA ARTG listing pages
  # Runs on every scheduled trigger, and on manual dispatch only when the
  # run_listings input is ticked.
  if: >
    github.event_name == 'schedule' ||
    (github.event_name == 'workflow_dispatch' && inputs.run_listings == true)
  runs-on: ubuntu-latest
  # This job pushes commits back to the repository, so the workflow token
  # needs write access to repository contents. Declared explicitly so the
  # job does not depend on the repository-wide default token permissions.
  permissions:
    contents: write
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        ref: ${{ github.ref }}

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.x'
        cache: 'pip'

    - name: Install Python dependencies
      run: pip install -r requirements.txt

    - name: Install Xvfb
      # Refresh the package index first: installing against a stale index
      # on the runner image can fail with 404s for superseded packages.
      run: |
        sudo apt-get update
        sudo apt-get install -y xvfb

    - name: Scrape TGA listing pages
      # Use a step-level timeout shorter than the 6h job limit so that when
      # scraping runs long the step times out and the commit step below
      # still gets to execute. continue-on-error keeps the job from being
      # marked as failed by the timeout, so subsequent steps run under
      # their normal conditions.
      timeout-minutes: 350
      continue-on-error: true
      env:
        PYTHONUNBUFFERED: '1'
        # User-supplied input is handed over through the environment
        # instead of being expanded into the script text, so its value can
        # never be interpreted as shell syntax.
        LISTINGS_MAX_PAGES: ${{ inputs.listings_max_pages }}
      run: |
        ARGS=(--tga)
        # Empty on scheduled runs and when the input is left blank: in that
        # case no page limit is passed.
        if [ -n "$LISTINGS_MAX_PAGES" ]; then
          ARGS+=(--tga-max-pages "$LISTINGS_MAX_PAGES")
        fi
        # --tga-start-page is intentionally omitted: the script auto-detects
        # the correct page from the existing row count in tga_artg.csv.
        python download_databases.py "${ARGS[@]}"

    - name: Commit updated CSV
      # always(): still commit whatever was collected when an earlier step
      # failed or the run was cancelled.
      if: always()
      run: |
        git config user.email "github-actions[bot]@users.noreply.github.com"
        git config user.name "github-actions[bot]"
        git add data/tga/tga_artg.csv
        # Nothing staged means the CSV is unchanged; finish the step early.
        git diff --staged --quiet && echo "No changes to commit" && exit 0
        # Number of data rows (header excluded) for the commit message.
        ROWS=$(python3 -c "import csv; print(sum(1 for _ in csv.DictReader(open('data/tga/tga_artg.csv'))))")
        git commit -m "Update TGA ARTG listings: ${ROWS} total rows [skip ci]"
        # Fold any other leftover working-tree changes (e.g. .tmp files,
        # logs) into the same commit so the tree is clean for the rebase.
        git add -A
        git diff --staged --quiet || git commit --amend --no-edit
        git pull --rebase
        git push

    - name: Update README with current statistics
      # Relies on the git identity configured by the previous step.
      if: always()
      run: |
        python3 update_readme_tga_stats.py
        git add README.md
        git diff --staged --quiet && echo "No README changes" && exit 0
        git commit -m "Update README with TGA statistics [skip ci]"
        git pull --rebase
        git push
# ---------------------------------------------------------------------------
# Job 2: fetch RegistrationDate for every row missing one
# ---------------------------------------------------------------------------
tga-dates:
  name: Fetch TGA registration dates
  # Run after tga-listings (if it ran) so that dates are fetched for any
  # newly added listings, and so the two jobs don't write to tga_artg.csv
  # at the same time and clobber each other's results.
  needs: tga-listings
  # always() lets this job be evaluated even when tga-listings was skipped,
  # failed or was cancelled; the rest of the expression then selects:
  #   - every scheduled run (scheduled events carry no inputs, so testing
  #     inputs.run_dates alone would skip this job on every schedule);
  #   - manual dispatches where run_dates is ticked, whether tga-listings
  #     succeeded or was skipped (run_listings: false);
  #   - any run where tga-listings failed or was cancelled, so that dates
  #     are collected for rows added during a partial run, regardless of
  #     whether run_dates was requested.
  if: >
    always() &&
    (github.event_name == 'schedule' ||
    inputs.run_dates == true ||
    needs.tga-listings.result == 'failure' ||
    needs.tga-listings.result == 'cancelled')
  runs-on: ubuntu-latest
  # This job pushes commits back to the repository, so the workflow token
  # needs write access to repository contents. Declared explicitly so the
  # job does not depend on the repository-wide default token permissions.
  permissions:
    contents: write
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        # Check out the branch that triggered the workflow so the commit
        # goes back to the right place (PR branch or main).
        ref: ${{ github.ref }}

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.x'
        cache: 'pip'

    - name: Install Python dependencies
      run: pip install -r requirements.txt

    - name: Fetch TGA dates
      # Step timeout is kept below the 6h job limit, and continue-on-error
      # stops a timeout or scraper failure from failing the job, so the
      # commit step below still runs and saves partial progress.
      timeout-minutes: 350
      continue-on-error: true
      env:
        PYTHONUNBUFFERED: '1'
        # User-supplied input is handed over through the environment
        # instead of being expanded into the script text, so its value can
        # never be interpreted as shell syntax. Falls back to 2.0 when the
        # input is empty (e.g. on scheduled runs).
        DATES_DELAY: ${{ inputs.dates_delay || '2.0' }}
      run: |
        python download_databases.py \
          --tga-dates \
          --tga-dates-delay "$DATES_DELAY"

    - name: Commit updated CSV
      run: |
        git config user.email "github-actions[bot]@users.noreply.github.com"
        git config user.name "github-actions[bot]"
        git add data/tga/tga_artg.csv
        # Nothing staged means the CSV is unchanged; finish the step early.
        git diff --staged --quiet && echo "No changes to commit" && exit 0
        # Count how many rows now have a non-blank RegistrationDate.
        WITH=$(python3 -c "import csv; print(sum(1 for r in csv.DictReader(open('data/tga/tga_artg.csv')) if r.get('RegistrationDate','').strip()))")
        # Total number of data rows (header excluded).
        TOTAL=$(python3 -c "import csv; print(sum(1 for _ in csv.DictReader(open('data/tga/tga_artg.csv'))))")
        git commit -m "Update TGA registration dates: ${WITH}/${TOTAL} rows have date [skip ci]"
        # Fold any other leftover working-tree changes (e.g. .tmp files,
        # logs) into the same commit so the tree is clean for the rebase.
        git add -A
        git diff --staged --quiet || git commit --amend --no-edit
        git pull --rebase
        git push

    - name: Update README with current statistics
      # Relies on the git identity configured by the previous step.
      run: |
        python3 update_readme_tga_stats.py
        git add README.md
        git diff --staged --quiet && echo "No README changes" && exit 0
        git commit -m "Update README with TGA statistics [skip ci]"
        git pull --rebase
        git push