# Skip to content
#
# Scrape
#
# Scrape #1024
#
# Workflow file for this run

# Runs the Chicago scrapers on a schedule, or on demand from the Actions tab.
name: Scrape

# Controls when the workflow will run
on:
  schedule:
    # Every day at 04:15 (GitHub evaluates cron schedules in UTC).
    - cron: '15 4 * * *'
    # Saturdays at 00:00. The scrape job compares github.event.schedule
    # against this exact string to pick the weekly steps, so keep them in sync.
    - cron: '0 0 * * 6'
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
    inputs:
      specific_scraper:
        description: Which scraper to run?
        type: choice
        options:
          - bills
          - events
          - people
      window:
        description: How many days to scrape?
        type: string

# Every run joins the same concurrency group, so scrapes do not overlap.
concurrency:
  group: chicago-scraper
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # Runs the pupa scrapers against the database, then the SQL / make
  # post-processing steps that operate on the freshly scraped data.
  scrape:
    # The type of runner that the job will run on
    runs-on: ubuntu-latest
    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v3

      - name: install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libgdal-dev
          pip install --upgrade pip
          pip install -r requirements.txt

      # Exactly one of the next three steps runs, chosen by the trigger:
      #   - no window and not the Saturday cron -> "run scraper without window"
      #   - no window and the Saturday cron     -> "weekly big scrape"
      #   - a window was supplied               -> "run scraper with window"
      #
      # Workflow inputs are handed to the shell through environment variables
      # rather than being expanded into the script text, so an input value can
      # never be interpreted as shell syntax.
      - name: run scraper without window
        if: ${{ !inputs.window && github.event.schedule != '0 0 * * 6' }}
        env:
          DJANGO_SETTINGS_MODULE: pupa.settings
          SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
          SCRAPER_NAME: ${{ inputs.specific_scraper }}
        # ${VAR:+"$VAR"} expands to nothing at all when the input is empty
        # (scheduled runs), so no empty argument is passed to pupa.
        run: pupa update chicago ${SCRAPER_NAME:+"$SCRAPER_NAME"} --rpm=0

      - name: weekly big scrape
        if: ${{ !inputs.window && github.event.schedule == '0 0 * * 6' }}
        env:
          DJANGO_SETTINGS_MODULE: pupa.settings
          SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: |-
          pupa update chicago people --rpm=0
          pupa update chicago bills window=180 --rpm=0
          pupa update chicago events window=180 --rpm=0

      - name: run scraper with window
        if: ${{ inputs.window }}
        env:
          DJANGO_SETTINGS_MODULE: pupa.settings
          SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
          SCRAPER_NAME: ${{ inputs.specific_scraper }}
          SCRAPE_WINDOW: ${{ inputs.window }}
        run: pupa update chicago ${SCRAPER_NAME:+"$SCRAPER_NAME"} window="$SCRAPE_WINDOW" --rpm=0

      # Post-processing. The connection string is quoted so the shell passes it
      # to psql as a single argument, without word splitting or globbing.
      - name: update vote count
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: psql "$DATABASE_URL" -f scripts/vote_counts.sql

      - name: reconstruct agendas
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: psql "$DATABASE_URL" -f scripts/reconstruct_agendas.sql

      - name: merge terms
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: psql "$DATABASE_URL" -f scripts/merge_memberships.sql

      - name: add topics
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: make add_topics

      - name: keepalive
        uses: gautamkrishnar/keepalive-workflow@v1
# Job: after scraping, run maintenance commands on the production Heroku app.
  index-and-stats:
    runs-on: ubuntu-latest
    needs: scrape
    steps:
      - name: install heroku
        run: npm install -g heroku

      # `heroku run` only reports the remote command's exit status when
      # --exit-code is given; without it a failed management command would
      # leave the step (and the job) green.
      - name: update search index
        env:
          HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
        run: heroku run --exit-code -a chi-councilmatic-production "python manage.py update_index --batch-size=50 --age=1"

      # The remaining commands are still attempted when an earlier one fails;
      # the job as a whole is then reported as failed.
      - name: update stats
        if: ${{ !cancelled() }}
        env:
          HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
        run: heroku run --exit-code -a chi-councilmatic-production "python manage.py populate_person_statistics"

      - name: clear cache
        if: ${{ !cancelled() }}
        env:
          HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
        run: heroku run --exit-code -a chi-councilmatic-production "python manage.py clear_cache"
# Job: dump the opencivicdata_* tables to SQLite, zip the file and attach it to a GitHub release.
  export:
    # The type of runner that the job will run on
    runs-on: ubuntu-latest
    needs: scrape
    steps:
      - uses: actions/checkout@v3

      - name: install dependencies
        run: |
          pip install "db-to-sqlite[postgresql] @ https://github.com/sgraaf/db-to-sqlite/archive/refs/heads/main.zip"
          pip install "sqlalchemy<2.0"

      - name: export
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: |
          # Both the connection string and the table pattern are quoted so the
          # shell hands them over verbatim; unquoted, the `*` could be expanded
          # against files in the working directory.
          db-to-sqlite "$DATABASE_URL" chicago_council.db --table-name-pattern 'opencivicdata_*'
          # The output of running rename.sql against the database is itself
          # fed back into the same database as SQL.
          sqlite3 chicago_council.db < scripts/rename.sql | sqlite3 chicago_council.db
          zip chicago_council.db.zip chicago_council.db

      - name: Push data
        # NOTE(review): this reference appeared as "WebFreak001/[email protected]"
        # (e-mail obfuscation residue), which is not a valid action reference.
        # The action name is inferred from the inputs below; the version tag is
        # a best guess - confirm both against the repository's real workflow.
        uses: WebFreak001/deploy-nightly@v1.1.0
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}  # automatically provided by github actions
        with:
          upload_url: 'https://uploads.github.com/repos/datamade/chicago-council-scrapers/releases/80533645/assets{?name,label}'
          release_id: 80533645  # same as above (id can just be taken out the upload_url, it's used to find old releases)
          asset_path: ./chicago_council.db.zip  # path to archive to upload
          asset_name: chicago_council.db.zip  # name to upload the release as, use $$ to insert date (YYYYMMDD) and 6 letter commit hash
          asset_content_type: application/zip  # required by GitHub API
# Job: run the bill_summarize pipeline and commit the regenerated summaries.csv.
  summaries:
    needs: scrape
    # The type of runner that the job will run on
    runs-on: ubuntu-latest
    defaults:
      run:
        # Applies to every `run` step in this job; `uses` steps are unaffected,
        # which is why the commit step names the CSV by its repo-relative path.
        working-directory: ./bill_summarize
    steps:
      - uses: actions/checkout@v3

      # Adds the alex-p/tesseract-ocr5 PPA before installing the OCR and PDF
      # tools, then the Python requirements and the spaCy English model.
      - name: install dependencies
        run: |
          sudo add-apt-repository --yes ppa:alex-p/tesseract-ocr5
          sudo apt-get update
          sudo apt-get install -y tesseract-ocr poppler-utils
          pip install --upgrade pip
          pip install -r requirements.txt
          python3 -m spacy download en_core_web_sm

      # The make targets receive the database URL and an OpenAI API key.
      - name: run pipeline
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: |
          make update_summaries_db
          make summaries.csv

      - name: Commit changes
        uses: EndBug/add-and-commit@v7
        with:
          message: update summary
          add: bill_summarize/summaries.csv