Skip to content

Add workflow to gen data (pre-squashed) #363

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: feat/pull-v2-api
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
14 changes: 14 additions & 0 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,17 @@ ENV PYTHONUNBUFFERED 1
RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
&& apt-get -y install --no-install-recommends postgresql-client ruby-full

# Install rvm: import the two GPG keys fetched from rvm.io, then pipe the
# installer fetched from get.rvm.io into bash, requesting the "stable" release.
RUN curl -sSL https://rvm.io/mpapis.asc | gpg --import -
RUN curl -sSL https://rvm.io/pkuczynski.asc | gpg --import -
RUN curl -sSL https://get.rvm.io | bash -s stable
# Add the vscode user to the rvm group.
RUN usermod -a -G rvm vscode
# For now, we can switch to ruby 2.7.1 by running .devcontainer/install-ruby-2.7.1.sh


# Copy the setup script and the dependency manifests into the image, then run
# the setup script at image build time.
# NOTE(review): the setup script, as shown in this change, also references
# gdrive_requirements.txt, which is not copied here — confirm it is available.
COPY ./.devcontainer/post-create-command.sh /scripts/
COPY ./requirements.txt requirements.txt
COPY ./Gemfile Gemfile
COPY ./Gemfile.lock Gemfile.lock
RUN bash /scripts/post-create-command.sh

1 change: 0 additions & 1 deletion .devcontainer/README
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,3 @@

The dev container can be used to work on the project in a consistent environment independent of what machine you are working on. When working in the dev container, you will have a postgres instance running. You can access the postgres instance just by running psql.


1 change: 0 additions & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
"service": "app",
"workspaceFolder": "/workspace",
"remoteUser": "vscode",
"postCreateCommand": "bash ./.devcontainer/post-create-command.sh",
"postStartCommand": "git config --global --add safe.directory ${containerWorkspaceFolder}",
"forwardPorts": [4567, 5432],
"extensions": [
Expand Down
4 changes: 1 addition & 3 deletions .devcontainer/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,10 @@ services:
network_mode: service:db

db:
image: postgres:latest
# image: postgres:15.4
image: postgres:15.3-bookworm
restart: unless-stopped
volumes:
- postgres-data:/var/lib/postgresql/data
#- ../pgdata:/var/lib/postgresql/data
environment:
POSTGRES_USER: app_user
POSTGRES_DB: "disclosure-backend"
Expand Down
5 changes: 0 additions & 5 deletions .devcontainer/enable-ssh-agent.ps1

This file was deleted.

19 changes: 19 additions & 0 deletions .devcontainer/install-ruby-2.7.1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash --login

# Install the old ruby 2.7.1 with rvm on Debian bookworm.
#
# The script builds OpenSSL 1.1.1t from source into a private prefix under
# $HOME/.openssl and passes that prefix to `rvm install`.
# Background: https://github.com/rvm/rvm/issues/5209
#
# A login shell is requested in the shebang so that the `rvm` command is
# available to the script.

OPENSSL_VERSION=1.1.1t
OPENSSL_PREFIX="$HOME/.openssl/openssl-$OPENSSL_VERSION"

# Stop at the first failing step of the OpenSSL build: continuing after a
# failed download or compile would hand a broken prefix to rvm.
set -e

# -y keeps apt from waiting for confirmation when run non-interactively.
sudo apt install -y build-essential

# ~/Downloads does not necessarily exist (e.g. in a fresh container).
mkdir -p ~/Downloads
cd ~/Downloads
wget "https://www.openssl.org/source/openssl-$OPENSSL_VERSION.tar.gz"
tar zxvf "openssl-$OPENSSL_VERSION.tar.gz"
cd "openssl-$OPENSSL_VERSION"
./config --prefix="$OPENSSL_PREFIX" --openssldir="$OPENSSL_PREFIX"
make
make install

# Replace the private build's certs directory with a link to the system
# certificates.
rm -rf "$OPENSSL_PREFIX/certs"
ln -s /etc/ssl/certs "$OPENSSL_PREFIX/certs"
cd ~

# NOTE(review): errexit is switched off before calling rvm as a precaution,
# on the assumption that rvm's shell code may not be errexit-safe — confirm.
# Failure of the install itself is still checked explicitly.
set +e

# replace ruby-x.x.x to install other older versions
rvm install ruby-2.7.1 --with-openssl-dir="$OPENSSL_PREFIX" || exit 1

rvm use 2.7.1
5 changes: 4 additions & 1 deletion .devcontainer/post-create-command.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
#!/bin/bash
# Installs the Python and Ruby dependencies of the project.

# Stop at the first failing command.
set -e

# Python: upgrade pip itself, then install from the requirements files.
pip install --upgrade pip
#pip install 'urllib3[secure]'
pip install -r requirements.txt
# NOTE(review): the Dockerfile that runs this script copies only
# requirements.txt, Gemfile and Gemfile.lock — confirm gdrive_requirements.txt
# is present in the working directory at that point.
pip install -r gdrive_requirements.txt
#pip install -r download/requirements.txt

# Ruby: install the pg and bundler gems, then run bundle install.
sudo gem install pg bundler
sudo bundle install
120 changes: 103 additions & 17 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -1,25 +1,111 @@
# This workflow will later be replaced with logic to "Generate Website Data"
# The verify-gdrive.yml workflow file will be renamed to this one
# We have to introduce this change in steps because GitHub gets confused until
# we add the new workflow file to the master branch
name: "Generate Website Data"
on:
workflow_dispatch:
push:
env:
POSTGRES_USER: app_user
POSTGRES_DB: disclosure-backend
POSTGRES_PASSWORD: app_password
jobs:
generate:
build:
runs-on: ubuntu-latest
env:
REPO_OWNER: ${{ github.repository_owner}}
REPO_BRANCH: ${{ github.ref_name }}
SERVICE_ACCOUNT_KEY_JSON: ${{ secrets.SERVICE_ACCOUNT_KEY_JSON }}
GDRIVE_FOLDER: ${{ vars.GDRIVE_FOLDER }}
outputs:
devcontainer: ${{ steps.filter.outputs.devcontainer }}
noncontainer: ${{ steps.filter.outputs.noncontainer }}
steps:
- uses: actions/checkout@v3
- run: pip install -r gdrive_requirements.txt
- run: python test_pull_from_gdrive.py
- name: Archive pulled files
uses: actions/upload-artifact@v3
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
name: redacted-netfile-files
path: .local/downloads
registry: ghcr.io
username: ${{github.actor}}
password: ${{secrets.GITHUB_TOKEN}}
- uses: actions/checkout@v3
- name: Get changed files
id: changed-files
uses: tj-actions/changed-files@v40
- name: List all changed files
id: filter
run: |
echo ${{github.event_name}}
noncontainer=true
if docker pull ghcr.io/caciviclab/disclosure-backend-static/${{github.ref_name}}:latest; then
devcontainer=false
else
devcontainer=true
fi
for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
echo "$file was changed"
if [[ ${{github.event_name}} = push ]]; then
if [[ $file = .devcontainer* ]]; then
devcontainer=true
elif [[ $file = *requirements.txt* ]]; then
devcontainer=true
elif [[ $file = Gemfile* ]]; then
devcontainer=true
fi
fi
done

echo "devcontainer=$devcontainer" >> $GITHUB_OUTPUT
echo "noncontainer=$noncontainer" >> $GITHUB_OUTPUT
- name: Build dev container
if: steps.filter.outputs.devcontainer == 'true'
run: |
docker build --no-cache --tag ghcr.io/caciviclab/disclosure-backend-static/${{github.ref_name}}:latest -f ./.devcontainer/Dockerfile .
docker push ghcr.io/caciviclab/disclosure-backend-static/${{github.ref_name}}:latest
- name: Check code changes
if: steps.filter.outputs.noncontainer == 'true'
run: |
echo "TODO: run test to verify that code changes are good"
# Job: runs after `build`, and only when `build` reported
# noncontainer == 'true'. The steps execute inside the dev container image
# for the current ref, with a postgres service container alongside.
generate:
needs: build
if: needs.build.outputs.noncontainer == 'true'
runs-on: ubuntu-latest
# Dev container image tagged for this ref, pulled from ghcr.io using the
# workflow actor and token as registry credentials.
container:
image: ghcr.io/caciviclab/disclosure-backend-static/${{github.ref_name}}:latest
credentials:
username: ${{ github.actor }}
password: ${{ secrets.github_token }}
env:
REPO_OWNER: ${{ github.repository_owner}}
REPO_BRANCH: ${{ github.ref_name }}
SERVICE_ACCOUNT_KEY_JSON: ${{ secrets.SERVICE_ACCOUNT_KEY_JSON }}
GDRIVE_FOLDER: ${{ vars.GDRIVE_FOLDER }}
# PG* connection settings for the psql/csvsql calls in the steps below.
# PGHOST matches the id of the `postgres` service defined under `services`.
# NOTE(review): confirm the `env` context may be referenced from job-level
# `env`; check GitHub's context-availability table for `jobs.<job_id>.env`.
PGHOST: postgres
PGDATABASE: ${{ env.POSTGRES_DB }}
PGUSER: ${{ env.POSTGRES_USER }}
PGPASSWORD: ${{ env.POSTGRES_PASSWORD }}
# Postgres service container, configured from the same POSTGRES_* values.
services:
postgres:
image: postgres:15.6-bullseye
env:
POSTGRES_USER: ${{ env.POSTGRES_USER }}
POSTGRES_DB: ${{ env.POSTGRES_DB }}
POSTGRES_PASSWORD: ${{ env.POSTGRES_PASSWORD }}
steps:
- uses: actions/checkout@v4
# Sanity checks: git version and safe.directory, database listing, a csvsql
# insert of a two-line test.csv, a listing of the non-system tables, and the
# installed sqlalchemy package details.
- name: Check setup
run: |
git -v
# This keeps git from thinking that the current dir is not a repo even though a .git dir exists
git config --global --add safe.directory "$GITHUB_WORKSPACE"
psql -l
echo "c1,c2" > test.csv
echo "a,b" >> test.csv
cat test.csv
csvsql -v --db postgresql:///disclosure-backend --insert test.csv
echo "List tables"
psql -c "SELECT * FROM pg_catalog.pg_tables WHERE schemaname != 'pg_catalog' AND schemaname != 'information_schema';"

pip show sqlalchemy
# Run the Makefile pipeline in order: clean, download, import, process.
- name: Create csv files
run: |
make clean
make download
make import
make process
# List the non-system tables present in the database after the run.
- name: Summarize results
run: |
echo "List tables"
psql -c "SELECT * FROM pg_catalog.pg_tables WHERE schemaname != 'pg_catalog' AND schemaname != 'information_schema';"

10 changes: 8 additions & 2 deletions .github/workflows/verify-gdrive.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,21 @@ on:
jobs:
check:
runs-on: ubuntu-latest
container:
image: ghcr.io/caciviclab/disclosure-backend-static/${{github.ref_name}}:latest
credentials:
username: ${{ github.actor }}
password: ${{ secrets.github_token }}

env:
REPO_OWNER: ${{ github.repository_owner}}
REPO_BRANCH: ${{ github.ref_name }}
SERVICE_ACCOUNT_KEY_JSON: ${{ secrets.SERVICE_ACCOUNT_KEY_JSON }}
GDRIVE_FOLDER: ${{ vars.GDRIVE_FOLDER }}
steps:
- uses: actions/checkout@v3
- run: pip install -r gdrive_requirements.txt
- run: python test_pull_from_gdrive.py
- name: Test pull from gdrive
run: python test_pull_from_gdrive.py
- name: Archive pulled files
uses: actions/upload-artifact@v3
with:
Expand Down
38 changes: 33 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,23 @@ CSV_PATH?=downloads/csv
CD := $(shell pwd)
WGET=bin/wget-wrapper --no-verbose --tries=3

# Enable the NetFile v2 download/import targets only when a service-account
# key is available: either SERVICE_ACCOUNT_KEY_JSON has a non-empty value, or
# the file .local/SERVICE_ACCOUNT_KEY_JSON.json exists. Otherwise neither
# variable is assigned here, so they add nothing to the prerequisite lists
# that reference them.
ifdef SERVICE_ACCOUNT_KEY_JSON
NETFILE_V2_DOWNLOAD=download-netfile-v2
NETFILE_V2_IMPORT=import-new-data
else ifneq ("$(wildcard .local/SERVICE_ACCOUNT_KEY_JSON.json)","")
NETFILE_V2_DOWNLOAD=download-netfile-v2
NETFILE_V2_IMPORT=import-new-data
endif

# Delete every .csv file under downloads/csv. The glob already covers
# office_elections.csv, measure_committees.csv and elections.csv, so they are
# no longer listed separately.
.PHONY: clean-spreadsheets
clean-spreadsheets:
	rm -rf downloads/csv/*.csv

# Remove the downloaded and generated data directories (downloads/raw,
# downloads/csv, .local/downloads, .local/csv), then print the versions of
# git, python, ruby and psql.
clean:
rm -rf downloads/raw downloads/csv
rm -rf downloads/raw downloads/csv .local/downloads .local/csv
git --version
python --version
ruby --version
psql --version

process: process.rb
# todo: remove RUBYOPT variable when activerecord fixes deprecation warnings
Expand All @@ -21,6 +33,9 @@ process: process.rb
bin/report-candidates
git --no-pager diff build/digests.json

# Run the NetFile v2 downloader script (download/main.py).
# Declared phony: this is a command, not a file to be built, so it must run
# even if a file with this name happens to exist.
.PHONY: download-netfile-v2
download-netfile-v2:
	python download/main.py

download-spreadsheets: downloads/csv/candidates.csv downloads/csv/committees.csv \
downloads/csv/referendums.csv downloads/csv/name_to_number.csv \
downloads/csv/office_elections.csv downloads/csv/elections.csv
Expand All @@ -36,7 +51,8 @@ upload-cache:
tar czf - downloads/csv downloads/static downloads/cached-db \
| aws s3 cp - s3://odca-data-cache/$(shell date +%Y-%m-%d).tar.gz --acl public-read

download: download-spreadsheets \
download: $(NETFILE_V2_DOWNLOAD) \
download-spreadsheets \
download-COAK-2014 download-COAK-2015 download-COAK-2016 \
download-COAK-2017 download-COAK-2018 \
download-COAK-2019 download-COAK-2020 \
Expand Down Expand Up @@ -81,13 +97,16 @@ do-import-spreadsheets:
./bin/remove-whitespace $(DATABASE_NAME) candidates Instagram
./bin/remove-whitespace $(DATABASE_NAME) candidates Twitter
./bin/remove-whitespace $(DATABASE_NAME) candidates Bio
./bin/make-null-empty $(DATABASE_NAME) candidates data_warning
./bin/make-null-empty $(DATABASE_NAME) candidates Committee_Name

echo 'DROP TABLE IF EXISTS referendums CASCADE;' | psql $(DATABASE_NAME)
./bin/create-table $(DATABASE_NAME) $(CSV_PATH) referendums
csvsql --db postgresql:///$(DATABASE_NAME) --insert --no-create --no-inference $(CSV_PATH)/referendums.csv
echo 'ALTER TABLE "referendums" ADD COLUMN id SERIAL PRIMARY KEY;' | psql $(DATABASE_NAME)
./bin/remove-whitespace $(DATABASE_NAME) referendums Short_Title
./bin/remove-whitespace $(DATABASE_NAME) referendums Summary
./bin/make-null-empty $(DATABASE_NAME) referendums data_warning

echo 'DROP TABLE IF EXISTS name_to_number CASCADE;' | psql $(DATABASE_NAME)
./bin/create-table $(DATABASE_NAME) $(CSV_PATH) name_to_number
Expand All @@ -98,6 +117,8 @@ do-import-spreadsheets:
csvsql --db postgresql:///$(DATABASE_NAME) --insert --no-create --no-inference $(CSV_PATH)/committees.csv
echo 'ALTER TABLE "committees" ADD COLUMN id SERIAL PRIMARY KEY;' | psql $(DATABASE_NAME)
./bin/remove-whitespace $(DATABASE_NAME) committees Filer_NamL
./bin/make-null-empty $(DATABASE_NAME) committees Filer_NamL
./bin/make-null-empty $(DATABASE_NAME) committees data_warning

echo 'DROP TABLE IF EXISTS office_elections CASCADE;' | psql $(DATABASE_NAME)
./bin/create-table $(DATABASE_NAME) $(CSV_PATH) office_elections
Expand All @@ -110,9 +131,7 @@ do-import-spreadsheets:
csvsql --db postgresql:///$(DATABASE_NAME) --insert --no-create --no-inference downloads/csv/elections.csv
echo 'ALTER TABLE "elections" ADD COLUMN id SERIAL PRIMARY KEY;' | psql $(DATABASE_NAME)

import-data: 496 497 A-Contributions B1-Loans B2-Loans C-Contributions \
D-Expenditure E-Expenditure F-Expenses F461P5-Expenditure F465P3-Expenditure \
F496P3-Contributions G-Expenditure H-Loans I-Contributions Summary
import-data: import-old-data $(NETFILE_V2_IMPORT)
echo 'CREATE TABLE IF NOT EXISTS "calculations" (id SERIAL PRIMARY KEY, subject_id integer, subject_type varchar(30), name varchar(40), value jsonb);' | psql $(DATABASE_NAME)
./bin/remove_duplicate_transactions
./bin/make_view
Expand All @@ -124,9 +143,18 @@ recreatedb:
# Run the search_index.rb script.
# Declared phony: this is a command, not a file to be built, so it must run
# even if a file named "reindex" happens to exist.
.PHONY: reindex
reindex:
	ruby search_index.rb

# Names handed one at a time to bin/import-file. Each list is defined once so
# the aggregate target and the per-name rule cannot drift apart.
OLD_IMPORT_TARGETS := 496 497 A-Contributions B1-Loans B2-Loans C-Contributions \
    D-Expenditure E-Expenditure F-Expenses F461P5-Expenditure F465P3-Expenditure \
    F496P3-Contributions G-Expenditure H-Loans I-Contributions Summary
NEW_IMPORT_TARGETS := elections_v2 committees_v2 a_contributions_v2

# None of these names is a file produced by the build; without .PHONY a stray
# file called e.g. "Summary" or "496" would make the import silently skip.
.PHONY: import-new-data import-old-data $(OLD_IMPORT_TARGETS) $(NEW_IMPORT_TARGETS)

# Aggregate target: import every v2 table.
import-new-data: $(NEW_IMPORT_TARGETS)

# Aggregate target: import every legacy table.
import-old-data: $(OLD_IMPORT_TARGETS)

# Import one legacy table; $@ is the table name.
$(OLD_IMPORT_TARGETS):
	DATABASE_NAME=$(DATABASE_NAME) ./bin/import-file $(CSV_PATH) $@

# Import one v2 table; $@ is the table name.
# NOTE(review): the meaning of the trailing 0 is defined by bin/import-file —
# confirm against that script.
$(NEW_IMPORT_TARGETS):
	DATABASE_NAME=$(DATABASE_NAME) ./bin/import-file $(CSV_PATH) $@ 0

downloads/csv/candidates.csv:
mkdir -p downloads/csv downloads/raw
$(WGET) -O- \
Expand Down
2 changes: 2 additions & 0 deletions bin/clean
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,6 @@ cat <<-QUERY | psql ${database_name}
DELETE FROM "$table_name"
WHERE "Tran_Date" is NULL;
QUERY
else
echo
fi
Loading