# eval #39
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow that evaluates Khoj against benchmark datasets.
name: eval

on:
  # Run on every release (any pushed tag)
  push:
    tags:
      - "*"
  # Allow manual triggers from GitHub UI
  workflow_dispatch:
    inputs:
      # Khoj mode to evaluate; exported to the eval step as KHOJ_MODE
      khoj_mode:
        description: 'Khoj Mode (general/default/research)'
        required: true
        default: 'default'
        type: choice
        options:
          - general
          - default
          - research
      # Benchmark dataset passed to tests/evals/eval.py via -d
      dataset:
        # Keep this list in sync with the options below
        description: 'Dataset to evaluate (frames/simpleqa/gpqa/math500)'
        required: true
        default: 'frames'
        type: choice
        options:
          - frames
          - simpleqa
          - gpqa
          - math500
      # Exported to the eval step as SAMPLE_SIZE
      sample_size:
        description: 'Number of samples to evaluate'
        required: false
        default: 200
        type: number
jobs:
  # Runs the eval script once per (khoj_mode, dataset) combination against a
  # locally started Khoj server backed by a Postgres service container.
  eval:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Use input from manual trigger if available, else run all combinations
        khoj_mode: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.khoj_mode)) || fromJSON('["general", "default", "research"]') }}
        dataset: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.dataset)) || fromJSON('["frames", "simpleqa"]') }}

    services:
      # Database for the Khoj server; published on localhost:5432 of the runner
      postgres:
        image: ankane/pgvector
        env:
          POSTGRES_PASSWORD: postgres
          POSTGRES_USER: postgres
          POSTGRES_DB: postgres
        ports:
          - "5432:5432"
        # Hold the job until the database reports ready
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5

    steps:
      - uses: actions/checkout@v3
        with:
          # Fetch full history and tags
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      # Exposes the app version as steps.hatch.outputs.version
      - name: Get App Version
        id: hatch
        run: echo "version=$(pipx run hatch version)" >> "$GITHUB_OUTPUT"

      - name: ⏬️ Install Dependencies
        env:
          DEBIAN_FRONTEND: noninteractive
        run: |
          # install postgres and other dependencies
          # steps on GitHub-hosted runners run as a non-root user, so apt needs sudo;
          # DEBIAN_FRONTEND is passed through explicitly because sudo resets the environment
          sudo --preserve-env=DEBIAN_FRONTEND apt-get update && sudo --preserve-env=DEBIAN_FRONTEND apt-get install -y git python3-pip libegl1 sqlite3 libsqlite3-dev libsqlite3-0 ffmpeg libsm6 libxext6
          sudo --preserve-env=DEBIAN_FRONTEND apt-get install -y postgresql postgresql-client && sudo --preserve-env=DEBIAN_FRONTEND apt-get install -y postgresql-server-dev-14
          # upgrade pip
          python -m ensurepip --upgrade && python -m pip install --upgrade pip
          # install terrarium for code sandbox
          git clone https://github.com/cohere-ai/cohere-terrarium.git && cd cohere-terrarium && npm install && mkdir pyodide_cache

      - name: ⬇️ Install Application
        run: |
          # Replace the dynamic version declaration with the version resolved above
          sed -i 's/dynamic = \["version"\]/version = "${{ steps.hatch.outputs.version }}"/' pyproject.toml
          pip install --upgrade .[dev]

      - name: 📝 Run Eval
        env:
          KHOJ_MODE: ${{ matrix.khoj_mode }}
          SAMPLE_SIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.sample_size || 200 }}
          BATCH_SIZE: "20"
          RANDOMIZE: "True"
          KHOJ_URL: "http://localhost:42110"
          KHOJ_LLM_SEED: "42"
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          # These two keys are withheld for math500. The `|| ''` fallback keeps
          # the variable empty in that case; without it the expression yields
          # the boolean false, which is exported as the literal string "false".
          SERPER_DEV_API_KEY: ${{ matrix.dataset != 'math500' && secrets.SERPER_DEV_API_KEY || '' }}
          OLOSTEP_API_KEY: ${{ matrix.dataset != 'math500' && secrets.OLOSTEP_API_KEY || '' }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          KHOJ_ADMIN_EMAIL: khoj
          KHOJ_ADMIN_PASSWORD: khoj
          POSTGRES_HOST: localhost
          POSTGRES_PORT: "5432"
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: postgres
          POSTGRES_DB: postgres
          KHOJ_TELEMETRY_DISABLE: "True"  # To disable telemetry for tests
        run: |
          # Start Khoj server in background
          khoj --anonymous-mode --non-interactive &
          # Start code sandbox
          npm run dev --prefix cohere-terrarium &
          # Wait for server to be ready
          # (timeout budget drops by 2 per 2 second poll; curl time is not counted)
          timeout=120
          while ! curl -s http://localhost:42110/api/health > /dev/null; do
            if [ $timeout -le 0 ]; then
              echo "Timed out waiting for Khoj server"
              exit 1
            fi
            echo "Waiting for Khoj server..."
            sleep 2
            timeout=$((timeout-2))
          done
          # Run evals
          python tests/evals/eval.py -d ${{ matrix.dataset }}

      - name: Upload Results
        if: always()  # Upload results even if tests fail
        # v3 of this action is retired on GitHub-hosted runners. v4 requires
        # unique artifact names per run, which the name below already provides
        # by including the mode and dataset of the matrix cell.
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ steps.hatch.outputs.version }}-${{ matrix.khoj_mode }}-${{ matrix.dataset }}
          path: |
            *_evaluation_results_*.csv
            *_evaluation_summary_*.txt

      - name: Display Results
        if: always()
        run: |
          # Read and display summary
          echo "## Evaluation Summary of Khoj on ${{ matrix.dataset }} in ${{ matrix.khoj_mode }} mode" >> $GITHUB_STEP_SUMMARY
          echo "**$(head -n 1 *_evaluation_summary_*.txt)**" >> $GITHUB_STEP_SUMMARY
          echo "- Khoj Version: ${{ steps.hatch.outputs.version }}" >> $GITHUB_STEP_SUMMARY
          echo "- Chat Model: Gemini 1.5 Flash 002" >> $GITHUB_STEP_SUMMARY
          echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
          tail -n +2 *_evaluation_summary_*.txt >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
          # Display in logs too
          echo "===== EVALUATION RESULTS ====="
          cat *_evaluation_summary_*.txt