# .github/workflows/scrape.yml

# Runs the pupa scrapers for Chicago on a schedule (or on demand), then the
# follow-up jobs: search index/statistics refresh, SQLite export, and bill summaries.

name: Scrape

# Controls when the workflow will run
on:
  schedule:
    # Daily run at 04:15 (GitHub evaluates cron schedules in UTC).
    - cron: '15 4 * * *'
    # Weekly run, Saturdays at 00:00 UTC. The scrape job compares
    # github.event.schedule against this exact string to select the
    # "weekly big scrape" step, so keep the two in sync.
    - cron: '0 0 * * 6'
  # Manual trigger from the Actions tab, with an optional scraper choice
  # and an optional day window.
  workflow_dispatch:
    inputs:
      specific_scraper:
        description: Which scraper to run?
        type: choice
        options:
          - bills
          - events
          - people
      window:
        description: How many days to scrape?
        type: string

# Every run of this workflow joins the same concurrency group, so runs do not
# overlap one another. cancel-in-progress is not set here.
# NOTE(review): presumably this protects the shared database from concurrent
# scrapes - confirm.
concurrency:
  group: chicago-scraper
    
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # Jobs: "scrape" runs first; "index-and-stats", "export" and "summaries"
  # each declare `needs: scrape`.
  scrape:
    # Runs the pupa scrapers for Chicago, then the SQL / Make post-processing
    # steps against the database named by the DB_CONNECTION_STRING secret.
    runs-on: ubuntu-latest

    steps:
      # Check out the repository so requirements.txt, scripts/ and the
      # Makefile are available to the steps below.
      - uses: actions/checkout@v4
      - name: install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libgdal-dev
          pip install --upgrade pip
          pip install -r requirements.txt

      # Exactly one of the next three steps runs, selected by the `window`
      # input and by which cron entry fired.
      #
      # Workflow inputs are handed to the shell through environment variables
      # instead of being expanded inline by the expression syntax, so their
      # values are never parsed as shell code.
      - name: run scraper without window
        if: ${{ !inputs.window && github.event.schedule != '0 0 * * 6' }}
        env:
          DJANGO_SETTINGS_MODULE: pupa.settings
          SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
          SPECIFIC_SCRAPER: ${{ inputs.specific_scraper }}
        # ${VAR:+"$VAR"} expands to nothing when the input is empty (as on
        # scheduled runs), so no scraper argument is passed in that case.
        run: |
          pupa update chicago ${SPECIFIC_SCRAPER:+"$SPECIFIC_SCRAPER"} --rpm=0

      - name: weekly big scrape
        if: ${{ !inputs.window && github.event.schedule == '0 0 * * 6' }}
        env:
          DJANGO_SETTINGS_MODULE: pupa.settings
          SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: |-
          pupa update chicago people --rpm=0
          pupa update chicago bills window=180 --rpm=0
          pupa update chicago events window=180 --rpm=0

      - name: run scraper with window
        if: ${{ inputs.window }}
        env:
          DJANGO_SETTINGS_MODULE: pupa.settings
          SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
          SPECIFIC_SCRAPER: ${{ inputs.specific_scraper }}
          WINDOW: ${{ inputs.window }}
        run: |
          pupa update chicago ${SPECIFIC_SCRAPER:+"$SPECIFIC_SCRAPER"} "window=${WINDOW}" --rpm=0

      # Post-processing. The connection string is quoted so the shell passes
      # it to psql as a single, unmodified argument.
      - name: update vote count
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: psql "$DATABASE_URL" -f scripts/vote_counts.sql
      - name: reconstruct agendas
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: psql "$DATABASE_URL" -f scripts/reconstruct_agendas.sql
      - name: merge terms
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: psql "$DATABASE_URL" -f scripts/merge_memberships.sql
      - name: add topics
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: make add_topics
      # NOTE(review): presumably keeps the scheduled triggers from being
      # disabled for repository inactivity - confirm against the action's docs.
      - name: keepalive
        uses: gautamkrishnar/keepalive-workflow@v1
  index-and-stats:
    # Once the scrape job has succeeded, run three Django management commands
    # on the chi-councilmatic-production Heroku app, one `heroku run` each.
    # The API key is scoped to the steps that call Heroku, not the install step.
    # NOTE(review): `heroku run` is invoked without an exit-code flag; confirm
    # whether a failing remote command is meant to fail this job.
    needs: scrape
    runs-on: ubuntu-latest
    steps:
      - name: install heroku
        run: npm install -g heroku
      - name: update search index
        env:
          HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
        run: >-
          heroku run -a chi-councilmatic-production
          "python manage.py update_index --batch-size=50 --age=1"
      - name: update stats
        env:
          HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
        run: >-
          heroku run -a chi-councilmatic-production
          "python manage.py populate_person_statistics"
      - name: clear cache
        env:
          HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
        run: >-
          heroku run -a chi-councilmatic-production
          "python manage.py clear_cache"

  export:
    # Copies the opencivicdata_* tables into a SQLite file, applies
    # scripts/rename.sql, zips the result and uploads it as a release asset.
    runs-on: ubuntu-latest
    needs: scrape

    steps:
      - uses: actions/checkout@v4
      - name: install dependencies
        run: |
          pip install "db-to-sqlite[postgresql] @ https://github.com/sgraaf/db-to-sqlite/archive/refs/heads/main.zip"
          pip install "sqlalchemy<2.0"
      - name: export
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
        run: |
          # The URL and the table pattern are quoted so the shell neither
          # word-splits the connection string nor glob-expands the pattern
          # against files in the working directory.
          db-to-sqlite "$DATABASE_URL" chicago_council.db --table-name-pattern 'opencivicdata_*'
          # The output of running rename.sql is itself fed back into sqlite3.
          cat scripts/rename.sql | sqlite3 chicago_council.db | sqlite3 chicago_council.db
          zip chicago_council.db.zip chicago_council.db
      - name: Push data
        uses: WebFreak001/deploy-nightly@v1.1.0
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # automatically provided by github actions
        with:
          upload_url: https://uploads.github.com/repos/datamade/chicago-council-scrapers/releases/80533645/assets{?name,label}
          release_id: 80533645 # same as above (id can just be taken out the upload_url, it's used to find old releases)
          asset_path: ./chicago_council.db.zip # path to archive to upload
          asset_name: chicago_council.db.zip # name to upload the release as, use $$ to insert date (YYYYMMDD) and 6 letter commit hash
          asset_content_type: application/zip # required by GitHub API
      
  summaries:
    # Runs the bill-summary pipeline in ./bill_summarize after a successful
    # scrape and commits the regenerated summaries.csv back to the repository.
    runs-on: ubuntu-latest
    needs: scrape

    # Applies to the `run` steps below; `uses` steps are not affected.
    defaults:
      run:
        working-directory: ./bill_summarize
    steps:
      - uses: actions/checkout@v4
      - name: install dependencies
        run: |
          sudo add-apt-repository --yes ppa:alex-p/tesseract-ocr5
          sudo apt-get update
          sudo apt-get install -y tesseract-ocr poppler-utils
          pip install --upgrade pip
          pip install -r requirements.txt
          python3 -m spacy download en_core_web_sm
      - name: run pipeline
        env:
          DATABASE_URL: ${{ secrets.DB_CONNECTION_STRING }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          make update_summaries_db
          make summaries.csv
      # The path is written from the repository root because this is a `uses`
      # step and therefore ignores the working-directory default above.
      # NOTE(review): pinned to an old major version; read the action's
      # changelog before bumping, as defaults may differ between majors.
      - name: Commit changes
        uses: EndBug/add-and-commit@v7
        with:
          add: bill_summarize/summaries.csv
          message: 'update summary'