# Sampler cudagraph (#1253) #1
# NOTE(review): the line above was captured together with this file and looks
# like a commit/run title rather than part of the workflow — confirm and drop
# it if so.

# End-to-end benchmark workflow: installs the package and runs the serving
# throughput / latency benchmarks on a single-GPU runner.
name: E2E Test

# Triggers: pushes and pull requests targeting `main` that touch the Python
# package or the tests, plus manual dispatch.
on:
  push:
    branches: [main]
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches: [main]
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:

# One concurrency group per git ref; a newer run in the same group cancels
# the run that is still in progress.
concurrency:
  group: e2e-test-${{ github.ref }}
  cancel-in-progress: true

jobs:
  e2e-test:
    # Run when the workflow lives in sgl-project/sglang, or for any
    # pull_request event.
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      # Editable install of the package with the "all" extra, then a forced
      # reinstall of flashinfer from the cu121 / torch2.4 wheel index.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

      # Each benchmark step below is capped at 10 minutes.
      - name: Benchmark Serving Throughput
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default

      # Runs from the repository root (no `cd`), unlike the unittest steps.
      - name: Benchmark Serving Latency
        timeout-minutes: 10
        run: |
          python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 1 --input 128 --output 8

      - name: Benchmark Serving Throughput (w/o RadixAttention)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache

      - name: Benchmark Serving Throughput (w/o ChunkedPrefill)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill

# NOTE(review): the original capture ended with the text "Provide feedback";
# it is kept here as a comment because it looks like page residue — confirm
# and delete.