# Scrape #1024
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Scrape workflow: runs on the cron schedules below or on manual dispatch.
name: Scrape

# Controls when the workflow will run
on:
  schedule:
    # Daily at 04:15 (GitHub evaluates cron expressions in UTC).
    - cron: '15 4 * * *'
    # Saturdays at 00:00. Steps in the scrape job compare
    # `github.event.schedule` against this exact string, so keep the two
    # in sync when editing it.
    - cron: '0 0 * * 6'

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
    inputs:
      # Forwarded to `pupa update chicago` as the scraper name.
      specific_scraper:
        type: choice
        description: Which scraper to run?
        options:
          - bills
          - events
          - people
      # When set, forwarded to the scraper as `window=<value>`.
      window:
        description: How many days to scrape?
        type: string

# Every run joins the same concurrency group, so runs of this workflow do not
# execute at the same time.
concurrency:
  group: chicago-scraper
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # Runs the pupa scrapers, then the SQL / make post-processing steps.
  scrape:
    # The type of runner that the job will run on
    runs-on: ubuntu-latest

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v4

      - name: install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libgdal-dev
          pip install --upgrade pip
          pip install -r requirements.txt

      # Runs when no window input was given and the trigger is not the
      # Saturday cron entry.
      - name: run scraper without window
        if: ${{ !inputs.window && github.event.schedule != '0 0 * * 6' }}
        env:
          DJANGO_SETTINGS_MODULE: pupa.settings
          SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
          # Inputs are handed to the shell through the environment instead of
          # being expanded into the script text, so their content cannot be
          # interpreted as shell syntax.
          SCRAPER_NAME: ${{ inputs.specific_scraper }}
        # ${VAR:+"$VAR"} expands to nothing when the input is empty (e.g. on a
        # scheduled run), so no empty argument is passed to pupa.
        run: |
          pupa update chicago ${SCRAPER_NAME:+"$SCRAPER_NAME"} --rpm=0

      # Runs only for the Saturday cron entry: people, then bills and events
      # with window=180.
      - name: weekly big scrape
        if: ${{ !inputs.window && github.event.schedule == '0 0 * * 6' }}
        env:
          DJANGO_SETTINGS_MODULE: pupa.settings
          SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: |-
          pupa update chicago people --rpm=0
          pupa update chicago bills window=180 --rpm=0
          pupa update chicago events window=180 --rpm=0

      # Runs when a window input was supplied on manual dispatch.
      - name: run scraper with window
        if: ${{ inputs.window }}
        env:
          DJANGO_SETTINGS_MODULE: pupa.settings
          SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
          # `window` is a free-form string input; keep it out of the script
          # text and let the shell expand it as a single quoted word.
          SCRAPER_NAME: ${{ inputs.specific_scraper }}
          SCRAPE_WINDOW: ${{ inputs.window }}
        run: |
          pupa update chicago ${SCRAPER_NAME:+"$SCRAPER_NAME"} "window=${SCRAPE_WINDOW}" --rpm=0

      # The connection string is quoted so the shell passes it to psql as one
      # argument without word-splitting or glob expansion.
      - name: update vote count
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: psql "$DATABASE_URL" -f scripts/vote_counts.sql

      - name: reconstruct agendas
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: psql "$DATABASE_URL" -f scripts/reconstruct_agendas.sql

      - name: merge terms
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: psql "$DATABASE_URL" -f scripts/merge_memberships.sql

      - name: add topics
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: make add_topics

      - name: keepalive
        uses: gautamkrishnar/keepalive-workflow@v1

  # Runs three management commands on the Heroku app once scraping is done.
  index-and-stats:
    runs-on: ubuntu-latest
    needs: scrape
    steps:
      - name: install heroku
        run: npm install -g heroku

      - name: update search index
        env:
          HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
        run: heroku run -a chi-councilmatic-production "python manage.py update_index --batch-size=50 --age=1"

      - name: update stats
        env:
          HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
        run: heroku run -a chi-councilmatic-production "python manage.py populate_person_statistics"

      - name: clear cache
        env:
          HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
        run: heroku run -a chi-councilmatic-production "python manage.py clear_cache"

  # Dumps the opencivicdata_* tables to SQLite, zips the file and uploads it
  # as a release asset.
  export:
    # The type of runner that the job will run on
    runs-on: ubuntu-latest
    needs: scrape
    steps:
      - uses: actions/checkout@v4

      - name: install dependencies
        run: |
          pip install "db-to-sqlite[postgresql] @ https://github.com/sgraaf/db-to-sqlite/archive/refs/heads/main.zip"
          pip install "sqlalchemy<2.0"

      - name: export
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        # The table pattern is quoted so the shell hands the literal `*` to
        # db-to-sqlite instead of expanding it against files in the checkout.
        # NOTE(review): the output of the first sqlite3 call is piped into a
        # second sqlite3 on the same database - presumably rename.sql emits
        # SQL statements to execute; confirm before simplifying.
        run: |
          db-to-sqlite "$DATABASE_URL" chicago_council.db --table-name-pattern 'opencivicdata_*'
          cat scripts/rename.sql | sqlite3 chicago_council.db | sqlite3 chicago_council.db
          zip chicago_council.db.zip chicago_council.db

      # NOTE(review): the version tag of this action was obscured in the
      # reviewed copy of the file. v1.1.0 is assumed because the
      # env/with layout below looks like that release's usage - confirm
      # against the repository history before merging.
      - name: Push data
        uses: WebFreak001/deploy-nightly@v1.1.0
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}  # automatically provided by github actions
        with:
          upload_url: 'https://uploads.github.com/repos/datamade/chicago-council-scrapers/releases/80533645/assets{?name,label}'
          release_id: 80533645  # same as above (id can just be taken out the upload_url, it's used to find old releases)
          asset_path: ./chicago_council.db.zip  # path to archive to upload
          asset_name: chicago_council.db.zip  # name to upload the release as, use $$ to insert date (YYYYMMDD) and 6 letter commit hash
          asset_content_type: application/zip  # required by GitHub API

  # Rebuilds bill summaries and commits the resulting CSV to the repository.
  summaries:
    # The type of runner that the job will run on
    runs-on: ubuntu-latest
    needs: scrape
    # Applies to `run` steps only; `uses` steps are unaffected, which is why
    # the commit step below names the CSV relative to the repository root.
    defaults:
      run:
        working-directory: ./bill_summarize
    steps:
      - uses: actions/checkout@v4

      - name: install dependencies
        run: |
          sudo add-apt-repository --yes ppa:alex-p/tesseract-ocr5
          sudo apt-get update
          sudo apt-get install -y tesseract-ocr poppler-utils
          pip install --upgrade pip
          pip install -r requirements.txt
          python3 -m spacy download en_core_web_sm

      - name: run pipeline
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          make update_summaries_db
          make summaries.csv

      - name: Commit changes
        uses: EndBug/add-and-commit@v7
        with:
          add: bill_summarize/summaries.csv
          message: 'update summary'