From 89f23a5178769ce867e8fb3af3cb44da2f5399ec Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Mon, 12 Aug 2024 16:11:38 +0800 Subject: [PATCH 001/118] docs: update setup github runner (#1050) --- docs/en/setup_github_runner.md | 89 ++++++++++++++++++++++++++++++++++ docs/en/setup_runner.md | 34 ------------- 2 files changed, 89 insertions(+), 34 deletions(-) create mode 100644 docs/en/setup_github_runner.md delete mode 100644 docs/en/setup_runner.md diff --git a/docs/en/setup_github_runner.md b/docs/en/setup_github_runner.md new file mode 100644 index 0000000000..97a7f26266 --- /dev/null +++ b/docs/en/setup_github_runner.md @@ -0,0 +1,89 @@ +# Set up self hosted runner for GitHub Action + +## Config Runner + +```bash +# https://github.com/sgl-project/sglang/settings/actions/runners/new?arch=x64&os=linux +# Involves some TOKEN and other private information, click the link to view specific steps. +``` + +## Start Runner + +add `/lib/systemd/system/e2e.service` +``` +[Unit] +StartLimitIntervalSec=0 +[Service] +Environment="CUDA_VISIBLE_DEVICES=7" +Environment="XDG_CACHE_HOME=/data/.cache" +Environment="HF_TOKEN=hf_xx" +Environment="OPENAI_API_KEY=sk-xx" +Environment="HOME=/data/zhyncs/runner-v1" +Environment="SGLANG_IS_IN_CI=true" +Restart=always +RestartSec=1 +ExecStart=/data/zhyncs/runner-v1/actions-runner/run.sh +[Install] +WantedBy=multi-user.target +``` + +add `/lib/systemd/system/unit.service` +``` +[Unit] +StartLimitIntervalSec=0 +[Service] +Environment="CUDA_VISIBLE_DEVICES=6" +Environment="XDG_CACHE_HOME=/data/.cache" +Environment="HF_TOKEN=hf_xx" +Environment="OPENAI_API_KEY=sk-xx" +Environment="HOME=/data/zhyncs/runner-v2" +Environment="SGLANG_IS_IN_CI=true" +Restart=always +RestartSec=1 +ExecStart=/data/zhyncs/runner-v2/actions-runner/run.sh +[Install] +WantedBy=multi-user.target +``` + +add `/lib/systemd/system/accuracy.service` +``` +[Unit] +StartLimitIntervalSec=0 +[Service] +Environment="CUDA_VISIBLE_DEVICES=5" +Environment="XDG_CACHE_HOME=/data/.cache" +Environment="HF_TOKEN=hf_xx" +Environment="OPENAI_API_KEY=sk-xx" +Environment="HOME=/data/zhyncs/runner-v3" +Environment="SGLANG_IS_IN_CI=true" +Restart=always +RestartSec=1 +ExecStart=/data/zhyncs/runner-v3/actions-runner/run.sh +[Install] +WantedBy=multi-user.target +``` + +```bash +cd /data/zhyncs/runner-v1 +python3 -m venv venv + +cd /data/zhyncs/runner-v2 +python3 -m venv venv + +cd /data/zhyncs/runner-v3 +python3 -m venv venv + +sudo systemctl daemon-reload + +sudo systemctl start e2e +sudo systemctl enable e2e +sudo systemctl status e2e + +sudo systemctl start unit +sudo systemctl enable unit +sudo systemctl status unit + +sudo systemctl start accuracy +sudo systemctl enable accuracy +sudo systemctl status accuracy +``` diff --git a/docs/en/setup_runner.md b/docs/en/setup_runner.md deleted file mode 100644 index 34f4576845..0000000000 --- a/docs/en/setup_runner.md +++ /dev/null @@ -1,34 +0,0 @@ -# Set up self hosted runner for GitHub Action - -## Config Runner - -```bash -# https://github.com/sgl-project/sglang/settings/actions/runners/new?arch=x64&os=linux -# Involves some TOKEN and other private information, click the link to view specific steps. 
-``` - -## Start Runner - -add `/lib/systemd/system/runner.service` -``` -[Unit] -StartLimitIntervalSec=0 -[Service] -Environment="CUDA_VISIBLE_DEVICES=7" -Environment="XDG_CACHE_HOME=/data/.cache" -Environment="HF_TOKEN=hf_**" -Environment="OPENAI_API_KEY=sk-**" -Environment="HOME=/data/zhyncs" -Restart=always -RestartSec=1 -ExecStart=/data/zhyncs/actions-runner/run.sh -[Install] -WantedBy=multi-user.target -``` - -```bash -sudo systemctl daemon-reload -sudo systemctl start runner -sudo systemctl enable runner -sudo systemctl status runner -``` From 41598e0d8e7de0aa777941c4ff5e1fddfb6f573c Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 12 Aug 2024 02:21:38 -0700 Subject: [PATCH 002/118] Add longer accuracy test on CI (#1049) --- .github/workflows/accuracy-test.yml | 45 ++++ .github/workflows/e2e-test.yml | 2 +- README.md | 2 +- python/sglang/bench_serving.py | 19 +- python/sglang/srt/managers/tp_worker.py | 3 +- python/sglang/test/run_eval.py | 8 + python/sglang/test/simple_eval_mgsm.py | 203 ++++++++++++++++++ test/srt/run_suite.py | 2 +- test/srt/test_chunked_prefill.py | 49 +++-- test/srt/test_eval_accuracy_large.py | 68 ++++++ ...accuracy.py => test_eval_accuracy_mini.py} | 8 +- test/srt/test_serving_throughput.py | 14 +- test/srt/test_torch_compile.py | 8 +- 13 files changed, 386 insertions(+), 45 deletions(-) create mode 100644 .github/workflows/accuracy-test.yml create mode 100644 python/sglang/test/simple_eval_mgsm.py create mode 100644 test/srt/test_eval_accuracy_large.py rename test/srt/{test_eval_accuracy.py => test_eval_accuracy_mini.py} (85%) diff --git a/.github/workflows/accuracy-test.yml b/.github/workflows/accuracy-test.yml new file mode 100644 index 0000000000..9c8e7bfeb4 --- /dev/null +++ b/.github/workflows/accuracy-test.yml @@ -0,0 +1,45 @@ +name: Accuracy Test + +on: + push: + branches: [ main ] + paths: + - "python/sglang/**" + - "test/**" + pull_request: + branches: [ main ] + paths: + - "python/sglang/**" + - "test/**" + workflow_dispatch: + +concurrency: + group: accuracy-test-${{ github.ref }} + cancel-in-progress: true + +jobs: + accuracy-test: + if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' + runs-on: accuracy + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Install dependencies + run: | + source $HOME/venv/bin/activate + echo "$HOME/venv/bin" >> $GITHUB_PATH + + pip install --upgrade pip + pip install -e "python[all]" + pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall + + git clone https://github.com/merrymercy/human-eval.git + cd human-eval + pip install -e . + + - name: Evaluate Accuracy + run: | + cd test/srt + python3 test_eval_accuracy_large.py diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 78ac4d9ec7..336f6a14f7 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -20,7 +20,7 @@ concurrency: jobs: e2e-test: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - runs-on: bench + runs-on: e2e steps: - name: Checkout code diff --git a/README.md b/README.md index f81593ef6d..1d7ff009b7 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance. - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size. 
``` -python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 2048 +python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096 ``` - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port. ``` diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index e3a2ad0a2c..0f9c882234 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -669,19 +669,20 @@ async def benchmark( "backend": args.backend, "dataset_name": args.dataset_name, "request_rate": request_rate, - "total_input": metrics.total_input, - "total_output": metrics.total_output, - "total_output_retokenized": metrics.total_output_retokenized, - "mean_e2e_latency": metrics.mean_e2e_latency_ms, - "median_e2e_latency": metrics.median_e2e_latency_ms, - "median_ttft": metrics.median_ttft_ms, - "median_itl": metrics.median_itl_ms, - "output_token_throughput": metrics.output_throughput, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "total_output_tokens_retokenized": metrics.total_output_retokenized, + "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms, + "median_e2e_latency_ms": metrics.median_e2e_latency_ms, + "median_ttft_ms": metrics.median_ttft_ms, + "median_itl_ms": metrics.median_itl_ms, + "output_throughput": metrics.output_throughput, "sharegpt_output_len": args.sharegpt_output_len, "random_input_len": args.random_input_len, "random_output_len": args.random_output_len, "random_range_ratio": args.random_range_ratio, - "benchmark_duration": benchmark_duration, + "duration": benchmark_duration, + "completed": metrics.completed, } else: print(f"Error running benchmark for request rate: {request_rate}") diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index 4c757737ec..f148852630 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -64,8 +64,7 @@ logger = logging.getLogger(__name__) -# TODO: Rename "CI" to "SGLANG_IS_IN_CI". -crash_on_warning = os.getenv("CI", "false") == "true" +crash_on_warning = os.getenv("SGLANG_IS_IN_CI", "false") == "true" class ModelTpServer: diff --git a/python/sglang/test/run_eval.py b/python/sglang/test/run_eval.py index 6c1f284b16..3d13d475b0 100644 --- a/python/sglang/test/run_eval.py +++ b/python/sglang/test/run_eval.py @@ -39,6 +39,14 @@ def run_eval(args): eval_obj = MathEval( filename, equality_checker, args.num_examples, args.num_threads ) + elif args.eval_name == "mgsm": + from sglang.test.simple_eval_mgsm import MGSMEval + + eval_obj = MGSMEval(args.num_examples, args.num_threads) + elif args.eval_name == "mgsm_en": + from sglang.test.simple_eval_mgsm import MGSMEval + + eval_obj = MGSMEval(args.num_examples, args.num_threads, languages=["en"]) elif args.eval_name == "gpqa": from sglang.test.simple_eval_gpqa import GPQAEval diff --git a/python/sglang/test/simple_eval_mgsm.py b/python/sglang/test/simple_eval_mgsm.py new file mode 100644 index 0000000000..ce00a1ac76 --- /dev/null +++ b/python/sglang/test/simple_eval_mgsm.py @@ -0,0 +1,203 @@ +# Adapted from https://github.com/openai/simple-evals/ + +""" +MGSM: Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems. 
+Language Models are Multilingual Chain-of-Thought Reasoners +Freda Shi, Mirac Suzgun, Markus Freitag, Xuezhi Wang, Suraj Srivats, Soroush Vosoughi, Hyung Won Chung, Yi Tay, Sebastian Ruder, Denny Zhou, Dipanjan Das, Jason Wei +https://arxiv.org/abs/2210.03057 reference: https://github.com/google-research/url-nlp +""" + +import re +import urllib +from typing import Optional + +from sglang.test import simple_eval_common as common +from sglang.test.simple_eval_common import ( + HTML_JINJA, + Eval, + EvalResult, + SamplerBase, + SingleEvalResult, +) + +ALL_LANGUAGES = ["bn", "de", "en", "es", "fr", "ja", "ru", "sw", "te", "th", "zh"] +LATIN_LANGUAGES = ["de", "en", "es", "fr", "sw"] +NON_LATIN_LANGUAGES = ["bn", "ja", "ru", "te", "th", "zh"] + +LANG_TO_FPATH = { + "bn": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_bn.tsv", + "de": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_de.tsv", + "en": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_en.tsv", + "es": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_es.tsv", + "fr": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_fr.tsv", + "ja": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_ja.tsv", + "ru": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_ru.tsv", + "sw": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_sw.tsv", + "te": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_te.tsv", + "th": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_th.tsv", + "zh": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_zh.tsv", +} +LANG_TO_INSTRUCTIONS = { + "en": """Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of "Answer:". Do not add anything other than the integer answer after "Answer:". + +{input}""", + "bn": """এই গণিতের সমস্যাটি সমাধান করুন। চূড়ান্ত উত্তর দেওয়ার আগে যুক্তিসম্পন্ন পদক্ষেপ প্রদান করুন। চূড়ান্ত উত্তরটি একক সংখ্যা হিসাবে "উত্তর:" এর পরে শেষ লাইনে দিন। "উত্তর:" এর পরে অন্য কিছু যুক্ত করবেন না।. + +{input}""", + "de": """Löse dieses Mathematikproblem. Gib die Schritte zur Begründung an, bevor du die endgültige Antwort in der letzten Zeile alleine im Format "Antwort:" gibst. Füge nichts anderes als die ganzzahlige Antwort nach "Antwort:" hinzu. + +{input}""", + "es": """Resuelve este problema matemático. Proporciona los pasos de razonamiento antes de dar la respuesta final en la última línea por sí misma en el formato de "Respuesta:". No añadas nada más que la respuesta entera después de "Respuesta:". + +{input}""", + "fr": """Résolvez ce problème de mathématiques. Donnez les étapes de raisonnement avant de fournir la réponse finale sur la dernière ligne elle-même dans le format de "Réponse:". N'ajoutez rien d'autre que la réponse entière après "Réponse:". + +{input}""", + "ja": """の数学の問題を解いてください。最終的な答えを出す前に、解答の推論過程を記述してください。そして最後の行には "答え:" の形式で答えを記述し、その後には整数の答え以外何も追加しないでください。 + +{input}""", + "ru": """Решите эту математическую задачу. Объясните шаги рассуждения перед тем, как дать окончательный ответ в последней строке сам по себе в формате "Ответ:". Не добавляйте ничего, кроме целочисленного ответа после "Ответ:". + +{input}""", + "sw": """Suluhisha tatizo hili la hesabu. Toa hatua za mantiki kabla ya kutoa jibu la mwisho kwenye mstari wa mwisho peke yake katika muundo wa "Jibu:". Usiongeze chochote kingine isipokuwa jibu la integer baada ya "Jibu:". + +{input}""", + "te": """ఈ గణిత సమస్యను పరిష్కరించండి. 
చివరి సమాధానాన్ని ఇవ్వదానికి ముందు తర్కాత్మక అదుగులను ఇవ్వండి. చివరి పంక్తిలో మాత్రమే 'సమాధానం:' అనే ఆకారంలో చివరి సమాధానాద్ని ఇవ్వండి సమాధానం: తర్వాత పూర్ణాంక సమాధానానికి తప్పించి ఎదేనా చేర్చవద్దు. + +{input}""", + "th": """แก้ปัญหาคณิตศาสตร์นี้ ให้ให้ขั้นตอนการใช้เหตุผลก่อนที่จะให้คำตอบสุดท้ายในบรรทัดสุดท้ายโดยอยู่ในรูปแบบ "คำตอบ:" ไม่ควรเพิ่มอะไรนอกจากคำตอบที่เป็นจำนวนเต็มหลังจาก "คำตอบ:" + +{input}""", + "zh": """解决这个数学问题。在最后一行给出答案前,请提供推理步骤。最后一行应该以 "答案: " 的形式独立给出答案。在 "答案:" 后不要添加除整数答案之外的任何内容。 + +{input}""", +} + +LANG_TO_ANSWER_PREFIX = { + "en": "Answer", + "bn": "উত্তর", + "de": "Antwort", + "es": "Respuesta", + "fr": "Réponse", + "ja": "答え", + "ru": "Ответ", + "sw": "Jibu", + "te": "సమాధానం", + "th": "คำตอบ", + "zh": "答案", +} + + +def parse_answer(answer: str, answer_prefix: str) -> str: + if answer_prefix not in answer: + return "" + + answer_text = answer.split(answer_prefix)[-1].strip() + + # find all the numbers (including decimals) in the string + numbers = re.findall(r"\d+\.?\d*", answer_text.replace(",", "")) + + # return the first number (removing trailing decimal point if present), + # or an empty string if there were no numbers + return numbers[-1].rstrip(".") if numbers else "" + + +def score_mgsm(target: str, prediction: str) -> bool: + if "." in prediction: + prediction = prediction.rstrip("0").rstrip(".") + + target = target.replace(",", "") + prediction = prediction.replace(",", "") + + return target == prediction + + +def get_lang_examples(lang: str) -> list[dict[str, str]]: + fpath = LANG_TO_FPATH[lang] + examples = [] + with urllib.request.urlopen(fpath) as f: + for line in f.read().decode("utf-8").splitlines(): + inputs, targets = line.strip().split("\t") + if "." in targets: + raise ValueError(f"targets {targets} contains a decimal point.") + # targets = int(targets.replace(",", "")) + examples.append({"inputs": inputs, "targets": targets, "lang": lang}) + return examples + + +def get_all_examples() -> list[dict[str, str]]: + examples = [] + for lang in ALL_LANGUAGES: + if lang != "en": + continue + examples += get_lang_examples(lang) + return examples + + +class MGSMEval(Eval): + def __init__( + self, + num_examples_per_lang: int = 250, # restrict to a subset of the data for debugging + num_threads: int = 64, + languages: Optional[list[str]] = ALL_LANGUAGES, + ): + if languages is None: + languages = ALL_LANGUAGES + else: + for language in languages: + if language not in ALL_LANGUAGES: + raise ValueError( + f"language {language} is not a valid language. 
" + f"It should be one in {ALL_LANGUAGES}" + ) + self._languages = languages + self._num_examples_per_lang = num_examples_per_lang + self._num_threads = num_threads + + examples = [] + for lang in self._languages: + lang_examples = get_lang_examples(lang) + examples.extend(lang_examples[: self._num_examples_per_lang]) + self.examples = examples + + def __call__(self, sampler: SamplerBase) -> EvalResult: + def fn(example: dict[str, str]): + language = example["lang"] + latin_language = ( + "group_latin" if language in LATIN_LANGUAGES else "group_non_latin" + ) + correct_answer = example["targets"] + instructoin = LANG_TO_INSTRUCTIONS[language] + prompt_messages = [ + sampler._pack_message( + content=instructoin.format(input=example["inputs"]), role="user" + ) + ] + try: + response_text = sampler(prompt_messages) + except Exception as e: + response_text = "" + + answer_prefix = LANG_TO_ANSWER_PREFIX[language] + extracted_answer = parse_answer(response_text, answer_prefix) + + score = score_mgsm(correct_answer, extracted_answer) + html = common.jinja_env.from_string(HTML_JINJA).render( + prompt_messages=prompt_messages, + next_message=dict(content=response_text, role="assistant"), + score=score, + correct_answer=correct_answer, + extracted_answer=extracted_answer, + ) + convo = prompt_messages + [dict(content=response_text, role="assistant")] + return SingleEvalResult( + html=html, + score=score, + convo=convo, + metrics={language: score, latin_language: score}, + ) + + results = common.map_with_progress( + fn, self.examples, num_threads=self._num_threads + ) + return common.aggregate_results(results, default_stats=("mean", "std")) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 08122389f9..c99b6a60be 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -7,7 +7,7 @@ "minimal": [ "test_chunked_prefill.py", "test_embedding_openai_server.py", - "test_eval_accuracy.py", + "test_eval_accuracy_mini.py", "test_large_max_new_tokens.py", "test_openai_server.py", "test_skip_tokenizer_init.py", diff --git a/test/srt/test_chunked_prefill.py b/test/srt/test_chunked_prefill.py index 3a9423bc5b..5b2bb4aaab 100644 --- a/test/srt/test_chunked_prefill.py +++ b/test/srt/test_chunked_prefill.py @@ -10,34 +10,41 @@ ) -class TestAccuracy(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, +class TestChunkedPrefill(unittest.TestCase): + + def run_mmlu(self, disable_radix_cache): + other_args = ["--chunked-prefill-size", "32"] + if disable_radix_cache: + other_args += ["--disable-radix-cache"] + + model = DEFAULT_MODEL_NAME_FOR_TEST + base_url = DEFAULT_URL_FOR_TEST + process = popen_launch_server( + model, + base_url, timeout=300, - other_args=["--chunked-prefill-size", "32"], + other_args=other_args, ) - @classmethod - def tearDownClass(cls): - kill_child_process(cls.process.pid) - - def test_mmlu(self): args = SimpleNamespace( - base_url=self.base_url, - model=self.model, + base_url=base_url, + model=model, eval_name="mmlu", - num_examples=20, - num_threads=20, + num_examples=32, + num_threads=32, ) - metrics = run_eval(args) - assert metrics["score"] >= 0.5 + try: + metrics = run_eval(args) + assert metrics["score"] >= 0.6 + finally: + kill_child_process(process.pid) + + def test_chunked_prefill(self): + self.run_mmlu(disable_radix_cache=False) + + def test_chunked_prefill_without_radix_cache(self): + 
self.run_mmlu(disable_radix_cache=True) if __name__ == "__main__": diff --git a/test/srt/test_eval_accuracy_large.py b/test/srt/test_eval_accuracy_large.py new file mode 100644 index 0000000000..84a60dbe90 --- /dev/null +++ b/test/srt/test_eval_accuracy_large.py @@ -0,0 +1,68 @@ +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_child_process +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + + +class TestEvalAccuracyLarge(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.base_url = "http://127.0.0.1:7157" + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=300, + other_args=["--log-level-http", "warning"], + ) + + @classmethod + def tearDownClass(cls): + kill_child_process(cls.process.pid) + + def test_mmlu(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mmlu", + num_examples=None, + num_threads=2048, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.70 + + def test_human_eval(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="humaneval", + num_examples=None, + num_threads=2048, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.65 + + def test_mgsm_en(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mgsm_en", + num_examples=None, + num_threads=2048, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.85 + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_eval_accuracy.py b/test/srt/test_eval_accuracy_mini.py similarity index 85% rename from test/srt/test_eval_accuracy.py rename to test/srt/test_eval_accuracy_mini.py index a3f16f857e..b5533da379 100644 --- a/test/srt/test_eval_accuracy.py +++ b/test/srt/test_eval_accuracy_mini.py @@ -10,7 +10,7 @@ ) -class TestAccuracy(unittest.TestCase): +class TestEvalAccuracyMini(unittest.TestCase): @classmethod def setUpClass(cls): @@ -27,12 +27,12 @@ def test_mmlu(self): base_url=self.base_url, model=self.model, eval_name="mmlu", - num_examples=20, - num_threads=20, + num_examples=32, + num_threads=32, ) metrics = run_eval(args) - assert metrics["score"] >= 0.5 + assert metrics["score"] >= 0.6 if __name__ == "__main__": diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py index 808bc833ea..25b07d8816 100644 --- a/test/srt/test_serving_throughput.py +++ b/test/srt/test_serving_throughput.py @@ -1,3 +1,4 @@ +import os import unittest from types import SimpleNamespace @@ -55,21 +56,30 @@ def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size kill_child_process(process.pid) assert res["completed"] == num_prompts + return res def test_default(self): - self.run_test( + res = self.run_test( disable_radix_cache=False, disable_flashinfer=False, chunked_prefill_size=-1, ) + if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + # A100 performance + assert res["output_throughput"] >= 1300 + def test_default_without_radix_cache(self): - self.run_test( + res = self.run_test( disable_radix_cache=True, disable_flashinfer=False, chunked_prefill_size=-1, ) + if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + # A100 performance + assert res["output_throughput"] >= 1400 + def test_default_without_flashinfer(self): self.run_test( disable_radix_cache=False, diff --git 
a/test/srt/test_torch_compile.py b/test/srt/test_torch_compile.py index c8869a9cca..1ea1438fee 100644 --- a/test/srt/test_torch_compile.py +++ b/test/srt/test_torch_compile.py @@ -10,7 +10,7 @@ ) -class TestAccuracy(unittest.TestCase): +class TestTorchCompile(unittest.TestCase): @classmethod def setUpClass(cls): @@ -29,12 +29,12 @@ def test_mmlu(self): base_url=self.base_url, model=self.model, eval_name="mmlu", - num_examples=20, - num_threads=20, + num_examples=32, + num_threads=32, ) metrics = run_eval(args) - assert metrics["score"] >= 0.5 + assert metrics["score"] >= 0.6 if __name__ == "__main__": From 0c1c72a0b409f255a1fcea666705af8140da5f1e Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 12 Aug 2024 02:48:40 -0700 Subject: [PATCH 003/118] Fix accuracy test (#1051) --- python/sglang/test/run_eval.py | 3 ++- python/sglang/test/simple_eval_humaneval.py | 10 ++-------- test/srt/test_eval_accuracy_large.py | 14 +++++++------- test/srt/test_serving_throughput.py | 8 ++++---- 4 files changed, 15 insertions(+), 20 deletions(-) diff --git a/python/sglang/test/run_eval.py b/python/sglang/test/run_eval.py index 3d13d475b0..51b32ca01b 100644 --- a/python/sglang/test/run_eval.py +++ b/python/sglang/test/run_eval.py @@ -16,6 +16,8 @@ def run_eval(args): + set_ulimit() + if "OPENAI_API_KEY" not in os.environ: os.environ["OPENAI_API_KEY"] = "EMPTY" @@ -117,7 +119,6 @@ def run_eval(args): parser.add_argument("--eval-name", type=str, default="mmlu") parser.add_argument("--num-examples", type=int) parser.add_argument("--num-threads", type=int, default=512) - set_ulimit() args = parser.parse_args() run_eval(args) diff --git a/python/sglang/test/simple_eval_humaneval.py b/python/sglang/test/simple_eval_humaneval.py index 7a0f90c467..efb0d0bd6f 100644 --- a/python/sglang/test/simple_eval_humaneval.py +++ b/python/sglang/test/simple_eval_humaneval.py @@ -6,21 +6,15 @@ https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/ """ -import json -import logging -import multiprocessing import random import re -from collections import Counter, defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed -from io import BytesIO -from typing import Any, Dict, List, Tuple +from typing import Dict, List -import blobfile as bf import tqdm try: - from human_eval.data import HUMAN_EVAL, read_problems + from human_eval.data import read_problems from human_eval.evaluation import estimate_pass_at_k from human_eval.execution import check_correctness # , unsafe_execute except (ImportError, ModuleNotFoundError): diff --git a/test/srt/test_eval_accuracy_large.py b/test/srt/test_eval_accuracy_large.py index 84a60dbe90..5569543313 100644 --- a/test/srt/test_eval_accuracy_large.py +++ b/test/srt/test_eval_accuracy_large.py @@ -32,12 +32,12 @@ def test_mmlu(self): base_url=self.base_url, model=self.model, eval_name="mmlu", - num_examples=None, - num_threads=2048, + num_examples=3000, + num_threads=1024, ) metrics = run_eval(args) - assert metrics["score"] >= 0.70 + assert metrics["score"] >= 0.71, f"{metrics}" def test_human_eval(self): args = SimpleNamespace( @@ -45,11 +45,11 @@ def test_human_eval(self): model=self.model, eval_name="humaneval", num_examples=None, - num_threads=2048, + num_threads=1024, ) metrics = run_eval(args) - assert metrics["score"] >= 0.65 + assert metrics["score"] >= 0.65, f"{metrics}" def test_mgsm_en(self): args = SimpleNamespace( @@ -57,11 +57,11 @@ def test_mgsm_en(self): model=self.model, eval_name="mgsm_en", num_examples=None, - num_threads=2048, + 
num_threads=1024, ) metrics = run_eval(args) - assert metrics["score"] >= 0.85 + assert metrics["score"] >= 0.85, f"{metrics}" if __name__ == "__main__": diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py index 25b07d8816..0066d01cb2 100644 --- a/test/srt/test_serving_throughput.py +++ b/test/srt/test_serving_throughput.py @@ -66,8 +66,8 @@ def test_default(self): ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": - # A100 performance - assert res["output_throughput"] >= 1300 + # A100 (PCIE) performance + assert res["output_throughput"] >= 1400 def test_default_without_radix_cache(self): res = self.run_test( @@ -77,8 +77,8 @@ def test_default_without_radix_cache(self): ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": - # A100 performance - assert res["output_throughput"] >= 1400 + # A100 (PCIE) performance + assert res["output_throughput"] >= 1450 def test_default_without_flashinfer(self): self.run_test( From c877292cc12a61011694d7d0ea53c05f247003f6 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 12 Aug 2024 03:39:01 -0700 Subject: [PATCH 004/118] Re-organize CI tests (#1052) --- .github/workflows/e2e-test.yml | 5 +- .../sglang/srt/constrained/base_tool_cache.py | 2 +- python/sglang/srt/managers/tp_worker.py | 17 +++-- python/sglang/srt/mem_cache/chunk_cache.py | 2 +- python/sglang/srt/server.py | 9 +++ test/srt/run_suite.py | 1 + ...est_eval_accuracy_large_chunked_prefill.py | 68 +++++++++++++++++++ test/srt/test_serving_throughput.py | 25 ++++--- test/srt/test_triton_attn_backend.py | 41 +++++++++++ 9 files changed, 148 insertions(+), 22 deletions(-) create mode 100644 test/srt/test_eval_accuracy_large_chunked_prefill.py create mode 100644 test/srt/test_triton_attn_backend.py diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 336f6a14f7..455594bd72 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -45,8 +45,7 @@ jobs: cd test/srt python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache - - name: Benchmark Serving Throughput (w/o FlashInfer) + - name: Benchmark Serving Throughput (w/ ChunkedPrefill) run: | cd test/srt - python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_flashinfer - + python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_with_chunked_prefill diff --git a/python/sglang/srt/constrained/base_tool_cache.py b/python/sglang/srt/constrained/base_tool_cache.py index 4cbb6bd226..fa1aff5eac 100644 --- a/python/sglang/srt/constrained/base_tool_cache.py +++ b/python/sglang/srt/constrained/base_tool_cache.py @@ -54,7 +54,7 @@ def _init_with_timer(key): return val def init_value(self, key): - raise NotImplementedError + raise NotImplementedError() def get_cache_hit_rate(self): if self.metrics["total"] == 0: diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index f148852630..a8b952361d 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -410,13 +410,16 @@ def get_new_prefill_batch(self) -> Optional[ScheduleBatch]: # Print stats if self.tp_rank == 0: - self.tree_cache_metrics["total"] += ( - adder.log_input_tokens + adder.log_hit_tokens - ) / 10**9 - self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9 - tree_cache_hit_rate = ( - self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"] - ) + if isinstance(self.tree_cache, RadixCache): + 
self.tree_cache_metrics["total"] += ( + adder.log_input_tokens + adder.log_hit_tokens + ) / 10**9 + self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9 + tree_cache_hit_rate = ( + self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"] + ) + else: + tree_cache_hit_rate = 0.0 logger.info( f"[gpu={self.gpu_id}] Prefill batch. " f"#new-seq: {len(can_run_list)}, " diff --git a/python/sglang/srt/mem_cache/chunk_cache.py b/python/sglang/srt/mem_cache/chunk_cache.py index 35b9171e5b..e7e48ecee4 100644 --- a/python/sglang/srt/mem_cache/chunk_cache.py +++ b/python/sglang/srt/mem_cache/chunk_cache.py @@ -68,7 +68,7 @@ def cache_unfinished_req(self, req: Req, token_ids: Optional[List[int]] = None): req.last_node = entry def insert(self): - raise NotImplementedError + raise NotImplementedError() def evict(self, num_tokens: int, evict_callback: Callable): pass diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 8b67663357..7331425fae 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -447,6 +447,15 @@ def _wait_and_warmup(server_args, pipe_finish_writer): print(f"Initialization failed. warmup error: {last_traceback}", flush=True) sys.exit(1) + # Print warnings here + if server_args.disable_radix_cache and server_args.chunked_prefill_size is not None: + logger.warning( + "You set both `--disable-radix-cache` and `--chunked-prefill-size`. " + "This combination is an experimental feature and we noticed it can lead to " + "wrong generation results. If you want to use chunked prefill, it is recommended " + "not using `--disable-radix-cache`." + ) + logger.info("The server is fired up and ready to roll!") if pipe_finish_writer is not None: pipe_finish_writer.send("init ok") diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index c99b6a60be..4d3f7de30a 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -12,6 +12,7 @@ "test_openai_server.py", "test_skip_tokenizer_init.py", "test_torch_compile.py", + "test_triton_attn_backend.py", "test_vision_openai_server.py", "test_large_max_new_tokens.py", "models/test_generation_models.py", diff --git a/test/srt/test_eval_accuracy_large_chunked_prefill.py b/test/srt/test_eval_accuracy_large_chunked_prefill.py new file mode 100644 index 0000000000..297fc22e1d --- /dev/null +++ b/test/srt/test_eval_accuracy_large_chunked_prefill.py @@ -0,0 +1,68 @@ +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_child_process +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + + +class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.base_url = "http://127.0.0.1:7157" + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=300, + other_args=["--log-level-http", "warning", "--chunked-prefill-size", "256"], + ) + + @classmethod + def tearDownClass(cls): + kill_child_process(cls.process.pid) + + def test_mmlu(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mmlu", + num_examples=3000, + num_threads=1024, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.71, f"{metrics}" + + def test_human_eval(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="humaneval", + num_examples=None, + num_threads=1024, + ) + + metrics = run_eval(args) + assert 
metrics["score"] >= 0.65, f"{metrics}" + + def test_mgsm_en(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mgsm_en", + num_examples=None, + num_threads=1024, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.85, f"{metrics}" + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py index 0066d01cb2..c733163f5a 100644 --- a/test/srt/test_serving_throughput.py +++ b/test/srt/test_serving_throughput.py @@ -3,6 +3,7 @@ from types import SimpleNamespace from sglang.bench_serving import run_benchmark +from sglang.srt.server_args import ServerArgs from sglang.srt.utils import kill_child_process from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server @@ -60,9 +61,9 @@ def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size def test_default(self): res = self.run_test( - disable_radix_cache=False, - disable_flashinfer=False, - chunked_prefill_size=-1, + disable_radix_cache=ServerArgs.disable_radix_cache, + disable_flashinfer=ServerArgs.disable_flashinfer, + chunked_prefill_size=ServerArgs.chunked_prefill_size, ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": @@ -72,21 +73,25 @@ def test_default(self): def test_default_without_radix_cache(self): res = self.run_test( disable_radix_cache=True, - disable_flashinfer=False, - chunked_prefill_size=-1, + disable_flashinfer=ServerArgs.disable_flashinfer, + chunked_prefill_size=ServerArgs.chunked_prefill_size, ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": # A100 (PCIE) performance assert res["output_throughput"] >= 1450 - def test_default_without_flashinfer(self): - self.run_test( - disable_radix_cache=False, - disable_flashinfer=True, - chunked_prefill_size=-1, + def test_default_with_chunked_prefill(self): + res = self.run_test( + disable_radix_cache=ServerArgs.disable_radix_cache, + disable_flashinfer=ServerArgs.disable_flashinfer, + chunked_prefill_size=8192, ) + if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + # A100 (PCIE) performance + assert res["output_throughput"] >= 1400 + def test_all_cases(self): for disable_radix_cache in [False, True]: for disable_flashinfer in [False, True]: diff --git a/test/srt/test_triton_attn_backend.py b/test/srt/test_triton_attn_backend.py new file mode 100644 index 0000000000..67cbc623c3 --- /dev/null +++ b/test/srt/test_triton_attn_backend.py @@ -0,0 +1,41 @@ +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_child_process +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + + +class TestTritonAttnBackend(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, cls.base_url, timeout=300, other_args=["--disable-flashinfer"] + ) + + @classmethod + def tearDownClass(cls): + kill_child_process(cls.process.pid) + + def test_mmlu(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mmlu", + num_examples=32, + num_threads=32, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.6 + + +if __name__ == "__main__": + unittest.main() From b0ad0c1bc8787937a7df5bc0487af1e9db6efb5e Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Mon, 12 Aug 2024 18:59:38 +0800 Subject: [PATCH 005/118] chore: bump 
v0.2.12 (#1048) --- README.md | 2 +- python/pyproject.toml | 2 +- python/sglang/version.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1d7ff009b7..59f72bf125 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ ### Method 2: From source ``` # Use the last release branch -git clone -b v0.2.11 https://github.com/sgl-project/sglang.git +git clone -b v0.2.12 https://github.com/sgl-project/sglang.git cd sglang pip install --upgrade pip diff --git a/python/pyproject.toml b/python/pyproject.toml index 32d4912a3a..22a906e8aa 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "sglang" -version = "0.2.11" +version = "0.2.12" description = "SGLang is yet another fast serving framework for large language models and vision language models." readme = "README.md" requires-python = ">=3.8" diff --git a/python/sglang/version.py b/python/sglang/version.py index 5635676f6b..b5c9b6cb71 100644 --- a/python/sglang/version.py +++ b/python/sglang/version.py @@ -1 +1 @@ -__version__ = "0.2.11" +__version__ = "0.2.12" From 6a38efa8342ef4b924b093d90260ead6d1f6cea7 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 13 Aug 2024 00:15:59 +0800 Subject: [PATCH 006/118] feat: replace all rmsnorm and silu (#1057) --- python/sglang/srt/models/chatglm.py | 4 ++-- python/sglang/srt/models/commandr.py | 2 +- python/sglang/srt/models/deepseek.py | 4 ++-- python/sglang/srt/models/deepseek_v2.py | 4 ++-- python/sglang/srt/models/gemma.py | 2 +- python/sglang/srt/models/grok.py | 2 +- python/sglang/srt/models/minicpm.py | 4 ++-- python/sglang/srt/models/mixtral.py | 2 +- python/sglang/srt/models/mixtral_quant.py | 2 +- python/sglang/srt/models/qwen.py | 4 ++-- python/sglang/srt/models/qwen2.py | 4 ++-- python/sglang/srt/models/qwen2_moe.py | 4 ++-- python/sglang/srt/models/stablelm.py | 2 +- 13 files changed, 20 insertions(+), 20 deletions(-) diff --git a/python/sglang/srt/models/chatglm.py b/python/sglang/srt/models/chatglm.py index d2ad02fbf4..0a22f994bb 100644 --- a/python/sglang/srt/models/chatglm.py +++ b/python/sglang/srt/models/chatglm.py @@ -24,8 +24,6 @@ from torch.nn import LayerNorm from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, @@ -43,6 +41,8 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import ChatGLMConfig +from sglang.srt.layers.activation import SiluAndMul +from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.forward_batch_info import InputMetadata diff --git a/python/sglang/srt/models/commandr.py b/python/sglang/srt/models/commandr.py index 1259285c46..f6d6f6e1f9 100644 --- a/python/sglang/srt/models/commandr.py +++ b/python/sglang/srt/models/commandr.py @@ -50,7 +50,6 @@ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, @@ -62,6 +61,7 @@ from 
vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.utils import set_weight_attrs +from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.forward_batch_info import InputMetadata diff --git a/python/sglang/srt/models/deepseek.py b/python/sglang/srt/models/deepseek.py index 98dcfd28df..59fd1ec7ed 100644 --- a/python/sglang/srt/models/deepseek.py +++ b/python/sglang/srt/models/deepseek.py @@ -27,9 +27,7 @@ get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, @@ -44,6 +42,8 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from sglang.srt.layers.activation import SiluAndMul +from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.forward_batch_info import InputMetadata diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index 739562730b..2198428b85 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -26,9 +26,7 @@ get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, MergedColumnParallelLinear, @@ -43,6 +41,8 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from sglang.srt.layers.activation import SiluAndMul +from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.managers.schedule_batch import global_server_args_dict diff --git a/python/sglang/srt/models/gemma.py b/python/sglang/srt/models/gemma.py index ce39731156..990937f518 100644 --- a/python/sglang/srt/models/gemma.py +++ b/python/sglang/srt/models/gemma.py @@ -24,7 +24,6 @@ from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import GeluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, @@ -35,6 +34,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.forward_batch_info import InputMetadata diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py index 38297b7d6e..13d4330d4c 100644 --- a/python/sglang/srt/models/grok.py +++ b/python/sglang/srt/models/grok.py @@ -31,7 +31,6 @@ 
get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( QKVParallelLinear, ReplicatedLinear, @@ -50,6 +49,7 @@ from vllm.utils import print_warning_once from sglang.srt.layers.fused_moe import fused_moe +from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.forward_batch_info import InputMetadata diff --git a/python/sglang/srt/models/minicpm.py b/python/sglang/srt/models/minicpm.py index bf572855e6..49ff1926f3 100644 --- a/python/sglang/srt/models/minicpm.py +++ b/python/sglang/srt/models/minicpm.py @@ -22,8 +22,6 @@ from torch import nn from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, @@ -37,6 +35,8 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from sglang.srt.layers.activation import SiluAndMul +from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.forward_batch_info import InputMetadata diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py index 63053ac50b..876c7a09d4 100644 --- a/python/sglang/srt/models/mixtral.py +++ b/python/sglang/srt/models/mixtral.py @@ -31,7 +31,6 @@ tensor_model_parallel_all_reduce, ) from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( QKVParallelLinear, ReplicatedLinear, @@ -48,6 +47,7 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.utils import print_warning_once +from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.forward_batch_info import InputMetadata diff --git a/python/sglang/srt/models/mixtral_quant.py b/python/sglang/srt/models/mixtral_quant.py index 07caf38334..115fce1d6d 100644 --- a/python/sglang/srt/models/mixtral_quant.py +++ b/python/sglang/srt/models/mixtral_quant.py @@ -29,7 +29,6 @@ get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( QKVParallelLinear, ReplicatedLinear, @@ -43,6 +42,7 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.forward_batch_info import InputMetadata diff --git a/python/sglang/srt/models/qwen.py b/python/sglang/srt/models/qwen.py index ffc512b1ca..93dae9585c 100644 --- a/python/sglang/srt/models/qwen.py +++ b/python/sglang/srt/models/qwen.py @@ -22,8 +22,6 @@ from transformers import PretrainedConfig from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size -from 
vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, @@ -37,6 +35,8 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from sglang.srt.layers.activation import SiluAndMul +from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.forward_batch_info import InputMetadata diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py index dec962bf0a..d1295bd8cc 100644 --- a/python/sglang/srt/models/qwen2.py +++ b/python/sglang/srt/models/qwen2.py @@ -22,8 +22,6 @@ from torch import nn from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, @@ -37,6 +35,8 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from sglang.srt.layers.activation import SiluAndMul +from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.forward_batch_info import InputMetadata diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py index f96f7e0e48..9bdbd75066 100644 --- a/python/sglang/srt/models/qwen2_moe.py +++ b/python/sglang/srt/models/qwen2_moe.py @@ -28,9 +28,7 @@ get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE -from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, @@ -47,6 +45,8 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from sglang.srt.layers.activation import SiluAndMul +from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.forward_batch_info import InputMetadata diff --git a/python/sglang/srt/models/stablelm.py b/python/sglang/srt/models/stablelm.py index aeaa46ab12..9e10f12f2a 100644 --- a/python/sglang/srt/models/stablelm.py +++ b/python/sglang/srt/models/stablelm.py @@ -24,7 +24,6 @@ from transformers import PretrainedConfig from vllm.config import CacheConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, @@ -38,6 +37,7 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.forward_batch_info import InputMetadata From 65e89baea9f152837f32ce8b0baa5b877bf39a5c Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 13 Aug 2024 13:12:56 +0800 Subject: 
[PATCH 007/118] fix: not use the default port (#1068) --- python/sglang/test/runners.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index e5ad3ea9d3..fadd56e8c2 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -174,6 +174,7 @@ def __init__( tp_size=1, torch_dtype=torch.float16, is_generation_model=None, + port=5157, ): self.is_generation_model = ( is_generation_model(model_path) @@ -184,6 +185,7 @@ def __init__( model_path=model_path, tp_size=tp_size, dtype=get_dtype_str(torch_dtype), + port=port, ) def forward( From 162f3ccb01d9b31d21f1a1ae3d6cabbfe4079838 Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Tue, 13 Aug 2024 13:48:07 +0800 Subject: [PATCH 008/118] Fix layernorm input shape (#1066) Co-authored-by: Yineng Zhang --- python/sglang/srt/models/deepseek_v2.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index 2198428b85..13dd477392 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -445,11 +445,12 @@ def forward( q_nope_out = q_input[..., : self.kv_lora_rank] torch.bmm(q_nope.transpose(0, 1), self.w_kc, out=q_nope_out.transpose(0, 1)) - k_input = self.kv_a_proj_with_mqa(hidden_states)[0].unsqueeze(1) - k_pe = k_input[..., self.kv_lora_rank :] - v_input = k_input[..., : self.kv_lora_rank] - v_input = self.kv_a_layernorm(v_input.contiguous()) + latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0] + v_input = latent_cache[..., : self.kv_lora_rank] + v_input = self.kv_a_layernorm(v_input.contiguous()).unsqueeze(1) + k_input = latent_cache.unsqueeze(1) k_input[..., : self.kv_lora_rank] = v_input + k_pe = k_input[..., self.kv_lora_rank :] q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe) q_input[..., self.kv_lora_rank :] = q_pe From 65915f9f3e93a0f682c97fe8ece268f2f2c00fa5 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 13 Aug 2024 13:48:54 +0800 Subject: [PATCH 009/118] fix: temporary solution for DeepSeek V2 H100 layout conversion issue (#1060) Co-authored-by: ispobock --- python/sglang/srt/layers/extend_attention.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/extend_attention.py b/python/sglang/srt/layers/extend_attention.py index 0a03f65626..097adca3ca 100644 --- a/python/sglang/srt/layers/extend_attention.py +++ b/python/sglang/srt/layers/extend_attention.py @@ -275,7 +275,9 @@ def extend_attention_fwd( BLOCK_DPE = 0 BLOCK_DV = Lv - if CUDA_CAPABILITY[0] >= 8: + if CUDA_CAPABILITY[0] >= 9: + BLOCK_M, BLOCK_N = (128, 64) + elif CUDA_CAPABILITY[0] >= 8: BLOCK_M, BLOCK_N = (128, 128) if Lq <= 128 else (64, 64) else: BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32) From 396a13e6ad6b62f850aac026e4ddc57134e5f4e7 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 13 Aug 2024 16:16:50 +0800 Subject: [PATCH 010/118] ci: add cancel pr workflow (#1070) --- .github/workflows/cancel-pr-workflow.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .github/workflows/cancel-pr-workflow.yml diff --git a/.github/workflows/cancel-pr-workflow.yml b/.github/workflows/cancel-pr-workflow.yml new file mode 100644 index 0000000000..d4709dc463 --- /dev/null +++ b/.github/workflows/cancel-pr-workflow.yml @@ -0,0 +1,22 @@ +name: Cancel PR Workflows on Merge + +on: + pull_request: + types: + - closed + +permissions: + actions: write + +jobs: + cancel: + 
if: github.event.pull_request.merged == true + runs-on: ubuntu-latest + steps: + - name: Cancel Previous Runs + uses: styfle/cancel-workflow-action@0.12.1 + with: + workflow_id: all + access_token: ${{ secrets.GITHUB_TOKEN }} + ignore_sha: true + pr_number: ${{ github.event.pull_request.number }} From f7fb68d2925201ce234e97d81ad3095e4dc48cbb Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 13 Aug 2024 16:43:23 +0800 Subject: [PATCH 011/118] ci: add moe test (#1053) --- .github/workflows/moe-test.yml | 42 +++++++ python/sglang/test/test_utils.py | 6 +- test/srt/test_chunked_prefill.py | 5 +- test/srt/test_embedding_openai_server.py | 5 +- test/srt/test_eval_accuracy_large.py | 10 +- ...est_eval_accuracy_large_chunked_prefill.py | 10 +- test/srt/test_eval_accuracy_mini.py | 5 +- test/srt/test_large_max_new_tokens.py | 5 +- test/srt/test_moe_serving_throughput.py | 112 ++++++++++++++++++ test/srt/test_openai_server.py | 5 +- test/srt/test_serving_throughput.py | 9 +- test/srt/test_skip_tokenizer_init.py | 5 +- test/srt/test_srt_endpoint.py | 5 +- test/srt/test_torch_compile.py | 5 +- test/srt/test_triton_attn_backend.py | 5 +- test/srt/test_vision_openai_server.py | 5 +- 16 files changed, 195 insertions(+), 44 deletions(-) create mode 100644 .github/workflows/moe-test.yml create mode 100644 test/srt/test_moe_serving_throughput.py diff --git a/.github/workflows/moe-test.yml b/.github/workflows/moe-test.yml new file mode 100644 index 0000000000..a781f2eff8 --- /dev/null +++ b/.github/workflows/moe-test.yml @@ -0,0 +1,42 @@ +name: MoE Test + +on: + push: + branches: [ main ] + paths: + - "python/sglang/**" + - "test/**" + pull_request: + branches: [ main ] + paths: + - "python/sglang/**" + - "test/**" + workflow_dispatch: + +concurrency: + group: moe-test-${{ github.ref }} + cancel-in-progress: true + +jobs: + moe-test: + if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' + runs-on: accuracy + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Install dependencies + run: | + source $HOME/venv/bin/activate + echo "$HOME/venv/bin" >> $GITHUB_PATH + + pip install --upgrade pip + pip install -e "python[all]" + pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall + + - name: Benchmark MOE Serving Throughput + run: | + cd test/srt + python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default + python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 7243ff2ecd..66f3e4f35e 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -21,7 +21,11 @@ from sglang.utils import get_exception_traceback DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct" -DEFAULT_URL_FOR_TEST = "http://127.0.0.1:8157" +DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1" +DEFAULT_URL_FOR_MOE_TEST = "http://127.0.0.1:6157" +DEFAULT_URL_FOR_ACCURACY_TEST = "http://127.0.0.1:7157" +DEFAULT_URL_FOR_UNIT_TEST = "http://127.0.0.1:8157" +DEFAULT_URL_FOR_E2E_TEST = "http://127.0.0.1:9157" def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None): diff --git a/test/srt/test_chunked_prefill.py b/test/srt/test_chunked_prefill.py index 5b2bb4aaab..94c4247624 100644 --- a/test/srt/test_chunked_prefill.py +++ b/test/srt/test_chunked_prefill.py @@ -5,20 +5,19 @@ from sglang.test.run_eval import run_eval from 
sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestChunkedPrefill(unittest.TestCase): - def run_mmlu(self, disable_radix_cache): other_args = ["--chunked-prefill-size", "32"] if disable_radix_cache: other_args += ["--disable-radix-cache"] model = DEFAULT_MODEL_NAME_FOR_TEST - base_url = DEFAULT_URL_FOR_TEST + base_url = DEFAULT_URL_FOR_UNIT_TEST process = popen_launch_server( model, base_url, diff --git a/test/srt/test_embedding_openai_server.py b/test/srt/test_embedding_openai_server.py index 45580feda0..fd8fec48e9 100644 --- a/test/srt/test_embedding_openai_server.py +++ b/test/srt/test_embedding_openai_server.py @@ -4,15 +4,14 @@ from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.utils import kill_child_process -from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, popen_launch_server +from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server class TestOpenAIServer(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = "intfloat/e5-mistral-7b-instruct" - cls.base_url = DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( cls.model, cls.base_url, timeout=300, api_key=cls.api_key diff --git a/test/srt/test_eval_accuracy_large.py b/test/srt/test_eval_accuracy_large.py index 5569543313..9f99b0b95d 100644 --- a/test/srt/test_eval_accuracy_large.py +++ b/test/srt/test_eval_accuracy_large.py @@ -5,17 +5,17 @@ from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_ACCURACY_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestEvalAccuracyLarge(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = "http://127.0.0.1:7157" + cls.base_url = DEFAULT_URL_FOR_ACCURACY_TEST cls.process = popen_launch_server( cls.model, cls.base_url, @@ -49,7 +49,7 @@ def test_human_eval(self): ) metrics = run_eval(args) - assert metrics["score"] >= 0.65, f"{metrics}" + assert metrics["score"] >= 0.64, f"{metrics}" def test_mgsm_en(self): args = SimpleNamespace( @@ -61,7 +61,7 @@ def test_mgsm_en(self): ) metrics = run_eval(args) - assert metrics["score"] >= 0.85, f"{metrics}" + assert metrics["score"] >= 0.84, f"{metrics}" if __name__ == "__main__": diff --git a/test/srt/test_eval_accuracy_large_chunked_prefill.py b/test/srt/test_eval_accuracy_large_chunked_prefill.py index 297fc22e1d..040a2db75f 100644 --- a/test/srt/test_eval_accuracy_large_chunked_prefill.py +++ b/test/srt/test_eval_accuracy_large_chunked_prefill.py @@ -5,17 +5,17 @@ from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_ACCURACY_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = "http://127.0.0.1:7157" + cls.base_url = DEFAULT_URL_FOR_ACCURACY_TEST cls.process = popen_launch_server( cls.model, cls.base_url, @@ -49,7 +49,7 @@ def test_human_eval(self): ) metrics = run_eval(args) - assert metrics["score"] >= 0.65, f"{metrics}" + assert metrics["score"] >= 0.64, f"{metrics}" def test_mgsm_en(self): args = SimpleNamespace( @@ -61,7 +61,7 @@ def test_mgsm_en(self): ) metrics = run_eval(args) - assert 
metrics["score"] >= 0.85, f"{metrics}" + assert metrics["score"] >= 0.84, f"{metrics}" if __name__ == "__main__": diff --git a/test/srt/test_eval_accuracy_mini.py b/test/srt/test_eval_accuracy_mini.py index b5533da379..a4219b1a0a 100644 --- a/test/srt/test_eval_accuracy_mini.py +++ b/test/srt/test_eval_accuracy_mini.py @@ -5,17 +5,16 @@ from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestEvalAccuracyMini(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300) @classmethod diff --git a/test/srt/test_large_max_new_tokens.py b/test/srt/test_large_max_new_tokens.py index 58f82b3516..f29adabced 100644 --- a/test/srt/test_large_max_new_tokens.py +++ b/test/srt/test_large_max_new_tokens.py @@ -10,17 +10,16 @@ from sglang.srt.utils import kill_child_process from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestOpenAIServer(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( cls.model, diff --git a/test/srt/test_moe_serving_throughput.py b/test/srt/test_moe_serving_throughput.py new file mode 100644 index 0000000000..6353e5099a --- /dev/null +++ b/test/srt/test_moe_serving_throughput.py @@ -0,0 +1,112 @@ +import os +import unittest +from types import SimpleNamespace + +from sglang.bench_serving import run_benchmark +from sglang.srt.server_args import ServerArgs +from sglang.srt.utils import kill_child_process +from sglang.test.test_utils import ( + DEFAULT_MOE_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_MOE_TEST, + popen_launch_server, +) + + +class TestServingThroughput(unittest.TestCase): + def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size): + # Launch the server + other_args = [] + if disable_radix_cache: + other_args.append("--disable-radix-cache") + if disable_flashinfer: + other_args.append("--disable-flashinfer") + other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)]) + other_args.extend(["--tensor-parallel-size", "2"]) + other_args.append("--enable-p2p-check") + + model = DEFAULT_MOE_MODEL_NAME_FOR_TEST + base_url = DEFAULT_URL_FOR_MOE_TEST + process = popen_launch_server( + model, base_url, timeout=300, other_args=other_args + ) + + # Run benchmark + num_prompts = 400 + args = SimpleNamespace( + backend="sglang", + base_url=base_url, + host=None, + port=None, + dataset_name="random", + dataset_path="", + model=None, + tokenizer=None, + num_prompts=num_prompts, + sharegpt_output_len=None, + random_input_len=4096, + random_output_len=2048, + random_range_ratio=0.0, + request_rate=float("inf"), + multi=None, + seed=0, + output_file=None, + disable_tqdm=False, + disable_stream=False, + disable_ignore_eos=False, + extra_request_body=None, + ) + + try: + res = run_benchmark(args) + finally: + kill_child_process(process.pid) + + assert res["completed"] == num_prompts + return res + + def test_default(self): + res = self.run_test( + disable_radix_cache=ServerArgs.disable_radix_cache, + disable_flashinfer=ServerArgs.disable_flashinfer, + 
chunked_prefill_size=ServerArgs.chunked_prefill_size, + ) + + if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + # A100 (PCIE) performance + assert res["output_throughput"] > 950 + + def test_default_without_radix_cache(self): + res = self.run_test( + disable_radix_cache=True, + disable_flashinfer=ServerArgs.disable_flashinfer, + chunked_prefill_size=ServerArgs.chunked_prefill_size, + ) + + if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + # A100 (PCIE) performance + assert res["output_throughput"] > 950 + + def test_default_with_chunked_prefill(self): + res = self.run_test( + disable_radix_cache=ServerArgs.disable_radix_cache, + disable_flashinfer=ServerArgs.disable_flashinfer, + chunked_prefill_size=8192, + ) + + if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + # A100 (PCIE) performance + print(res["output_throughput"]) + + def test_all_cases(self): + for disable_radix_cache in [False, True]: + for disable_flashinfer in [False, True]: + for chunked_prefill_size in [-1, 2048]: + self.run_test( + disable_radix_cache=False, + disable_flashinfer=False, + chunked_prefill_size=-1, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index b66c35f01d..8724247564 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -8,17 +8,16 @@ from sglang.srt.utils import kill_child_process from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestOpenAIServer(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( cls.model, cls.base_url, timeout=300, api_key=cls.api_key diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py index c733163f5a..c99d2e07e2 100644 --- a/test/srt/test_serving_throughput.py +++ b/test/srt/test_serving_throughput.py @@ -5,11 +5,14 @@ from sglang.bench_serving import run_benchmark from sglang.srt.server_args import ServerArgs from sglang.srt.utils import kill_child_process -from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_E2E_TEST, + popen_launch_server, +) class TestServingThroughput(unittest.TestCase): - def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size): # Launch the server other_args = [] @@ -20,7 +23,7 @@ def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)]) model = DEFAULT_MODEL_NAME_FOR_TEST - base_url = "http://127.0.0.1:9157" + base_url = DEFAULT_URL_FOR_E2E_TEST process = popen_launch_server( model, base_url, timeout=300, other_args=other_args ) diff --git a/test/srt/test_skip_tokenizer_init.py b/test/srt/test_skip_tokenizer_init.py index 01bfdb96a3..7501056151 100644 --- a/test/srt/test_skip_tokenizer_init.py +++ b/test/srt/test_skip_tokenizer_init.py @@ -6,17 +6,16 @@ from sglang.srt.utils import kill_child_process from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestSkipTokenizerInit(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = 
DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=300, other_args=["--skip-tokenizer-init"] ) diff --git a/test/srt/test_srt_endpoint.py b/test/srt/test_srt_endpoint.py index 2c40f53602..5e6bcbf60a 100644 --- a/test/srt/test_srt_endpoint.py +++ b/test/srt/test_srt_endpoint.py @@ -6,17 +6,16 @@ from sglang.srt.utils import kill_child_process from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestSRTEndpoint(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300) @classmethod diff --git a/test/srt/test_torch_compile.py b/test/srt/test_torch_compile.py index 1ea1438fee..5133d3cd3c 100644 --- a/test/srt/test_torch_compile.py +++ b/test/srt/test_torch_compile.py @@ -5,17 +5,16 @@ from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestTorchCompile(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"] ) diff --git a/test/srt/test_triton_attn_backend.py b/test/srt/test_triton_attn_backend.py index 67cbc623c3..7a453d8be7 100644 --- a/test/srt/test_triton_attn_backend.py +++ b/test/srt/test_triton_attn_backend.py @@ -5,17 +5,16 @@ from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) class TestTritonAttnBackend(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=300, other_args=["--disable-flashinfer"] ) diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py index 0449e33f1b..c599d8b368 100644 --- a/test/srt/test_vision_openai_server.py +++ b/test/srt/test_vision_openai_server.py @@ -5,15 +5,14 @@ from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.utils import kill_child_process -from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, popen_launch_server +from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server class TestOpenAIVisionServer(unittest.TestCase): - @classmethod def setUpClass(cls): cls.model = "liuhaotian/llava-v1.6-vicuna-7b" - cls.base_url = DEFAULT_URL_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( cls.model, From 0076f1154160f53a6c5de8a3716783071f6ef617 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 13 Aug 2024 19:08:43 +0800 Subject: [PATCH 012/118] fix: use devel for Triton's compiler requirements (#1074) --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 95127b33a9..2f7541c9a4 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,5 +1,5 @@ ARG CUDA_VERSION=12.1.1 -FROM 
nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu20.04 +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 ARG BUILD_TYPE=all ENV DEBIAN_FRONTEND=noninteractive From cebd78d83ee193b1d35f0591e7beb62f2b944b8e Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 13 Aug 2024 20:12:58 +0800 Subject: [PATCH 013/118] ci: add accuracy timeout (#1078) --- .github/workflows/accuracy-test.yml | 1 + test/srt/test_moe_serving_throughput.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/accuracy-test.yml b/.github/workflows/accuracy-test.yml index 9c8e7bfeb4..16bb584f4a 100644 --- a/.github/workflows/accuracy-test.yml +++ b/.github/workflows/accuracy-test.yml @@ -43,3 +43,4 @@ jobs: run: | cd test/srt python3 test_eval_accuracy_large.py + timeout-minutes: 20 diff --git a/test/srt/test_moe_serving_throughput.py b/test/srt/test_moe_serving_throughput.py index 6353e5099a..da223e80b9 100644 --- a/test/srt/test_moe_serving_throughput.py +++ b/test/srt/test_moe_serving_throughput.py @@ -31,7 +31,7 @@ def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size ) # Run benchmark - num_prompts = 400 + num_prompts = 200 args = SimpleNamespace( backend="sglang", base_url=base_url, From 95f5fbf1a75f4256cedb35da5c2e38f7841d0ba4 Mon Sep 17 00:00:00 2001 From: rainred <107027757+gryffindor-rr@users.noreply.github.com> Date: Tue, 13 Aug 2024 20:47:22 +0800 Subject: [PATCH 014/118] Fix create_abort_task, GenerateReqInput does not have rids. (#1079) Co-authored-by: lzhang --- python/sglang/srt/managers/tokenizer_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index e1bfbc7e67..d5fbfe05d3 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -507,7 +507,7 @@ async def abort_request(): if obj.is_single: self.abort_request(obj.rid) else: - for rid in obj.rids: + for rid in obj.rid: self.abort_request(rid) background_tasks = BackgroundTasks() From 312e8492556dd092368452f349ed45af3e3a68b6 Mon Sep 17 00:00:00 2001 From: Lucien Date: Wed, 14 Aug 2024 06:07:57 +0800 Subject: [PATCH 015/118] Example file for docker compose and k8s (#1006) --- README.md | 10 ++++- docker/compose.yaml | 31 ++++++++++++++ docker/k8s-sglang-service.yaml | 76 ++++++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 docker/compose.yaml create mode 100644 docker/k8s-sglang-service.yaml diff --git a/README.md b/README.md index 59f72bf125..117c329bb0 100644 --- a/README.md +++ b/README.md @@ -76,9 +76,17 @@ docker run --gpus all \ --env "HF_TOKEN=" \ --ipc=host \ lmsysorg/sglang:latest \ - python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000 + python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000 ``` +### Method 4: Using docker compose + +> This method is recommended if you plan to serve it as a service. +> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml). + +1. Copy the [compose.yml](./docker/compose.yaml) to your local machine +2. Execute the command `docker compose up -d` in your terminal. + ### Common Notes - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. 
If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server. - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`. diff --git a/docker/compose.yaml b/docker/compose.yaml new file mode 100644 index 0000000000..f2da3a416a --- /dev/null +++ b/docker/compose.yaml @@ -0,0 +1,31 @@ +services: + sglang: + image: lmsysorg/sglang:latest + container_name: sglang + volumes: + - ${HOME}/.cache/huggingface:/root/.cache/huggingface + restart: always + network_mode: host + # Or you can only publish port 30000 + # ports: + # - 30000:30000 + environment: + HF_TOKEN: + entrypoint: python3 -m sglang.launch_server + command: + --model-path meta-llama/Meta-Llama-3.1-8B-Instruct + --host 0.0.0.0 + --port 30000 + ulimits: + memlock: -1 + stack: 67108864 + ipc: host + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"] + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ['0'] + capabilities: [gpu] diff --git a/docker/k8s-sglang-service.yaml b/docker/k8s-sglang-service.yaml new file mode 100644 index 0000000000..c217f356af --- /dev/null +++ b/docker/k8s-sglang-service.yaml @@ -0,0 +1,76 @@ +apiVersion: node.k8s.io/v1 +kind: RuntimeClass +metadata: + name: nvidia +handler: nvidia +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: meta-llama-31-8b-instruct-sglang +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: meta-llama-31-8b-instruct-sglang + template: + metadata: + labels: + app: meta-llama-31-8b-instruct-sglang + model: meta-llama-31-8b-instruct + engine: sglang + spec: + hostIPC: true + restartPolicy: Always + runtimeClassName: nvidia + containers: + - name: meta-llama-31-8b-instruct-sglang + image: docker.io/lmsysorg/sglang:latest + imagePullPolicy: Always # IfNotPresent or Never + ports: + - containerPort: 30000 + command: ["python3", "-m", "sglang.launch_server"] + args: ["--model-path", "meta-llama/Meta-Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"] + env: + - name: HF_TOKEN + value: + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: hf-cache + mountPath: /root/.cache/huggingface + readOnly: true + - name: localtime + mountPath: /etc/localtime + readOnly: true + livenessProbe: + httpGet: + path: /health + port: 30000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumes: + - name: hf-cache + hostPath: + path: /root/.cache/huggingface + type: Directory + - name: localtime + hostPath: + path: /etc/localtime + type: File +--- +apiVersion: v1 +kind: Service +metadata: + name: meta-llama-31-8b-instruct-sglang +spec: + selector: + app: meta-llama-31-8b-instruct-sglang + ports: + - protocol: TCP + port: 30000 # port on host + targetPort: 30000 # port in container + type: LoadBalancer From ad3e4f16199a51862d72845f5f7ea53cc92442d2 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 13 Aug 2024 15:44:25 -0700 Subject: [PATCH 016/118] Update the mixtral to use the better FusedMoE layer (#1081) --- docs/en/model_support.md | 2 +- python/sglang/srt/models/mixtral.py | 308 ++++------------------ python/sglang/srt/models/mixtral_quant.py | 3 - test/srt/test_moe_serving_throughput.py | 2 +- 4 files changed, 57 insertions(+), 258 deletions(-) diff --git a/docs/en/model_support.md b/docs/en/model_support.md index e46e99e85c..1d720acf5c 100644 --- a/docs/en/model_support.md +++ b/docs/en/model_support.md @@ -5,7 +5,7 @@ To support 
a new model in SGLang, you only need to add a single file under [SGLa Another valuable resource is the [vLLM Models Directory](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models). vLLM has extensive coverage of models, and SGLang has reused vLLM for most parts of the model implementations. This similarity makes it easy to port many models from vLLM to SGLang. To port a model from vLLM to SGLang, you can compare these two files [SGLang LLaMA Implementation](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/llama2.py) and [vLLM LLaMA Implementation](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llama.py). This comparison will help you understand how to convert a model implementation from vLLM to SGLang. The major difference is the replacement of PagedAttention with RadixAttention. The other parts are almost identical. Specifically, - - Replace vllm's `Attention` with `RadixAttention`. + - Replace vllm's `Attention` with `RadixAttention`. Note that you need to pass `layer_id` all the way to `RadixAttention`. - Replace vllm's `LogitsProcessor` with SGLang's `LogitsProcessor`. - Remove `Sample`. - Change `forward()` functions, and add `input_metadata`. diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py index 876c7a09d4..d11f6c9519 100644 --- a/python/sglang/srt/models/mixtral.py +++ b/python/sglang/srt/models/mixtral.py @@ -18,34 +18,25 @@ """Inference-only Mixtral model.""" from typing import Iterable, Optional, Tuple -import numpy as np import torch -import torch.nn.functional as F from torch import nn from transformers import MixtralConfig -from vllm import _custom_ops as ops from vllm.config import CacheConfig -from vllm.distributed import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce, -) -from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( QKVParallelLinear, ReplicatedLinear, RowParallelLinear, ) from vllm.model_executor.layers.quantization.base_config import QuantizationConfig -from vllm.model_executor.layers.quantization.fp8 import Fp8Config from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding, ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.utils import set_weight_attrs -from vllm.utils import print_warning_once from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor @@ -69,216 +60,44 @@ def __init__( hidden_size: int, intermediate_size: int, params_dtype: Optional[torch.dtype] = None, - tp_size: Optional[int] = None, quant_config: Optional[QuantizationConfig] = None, + tp_size: Optional[int] = None, + prefix: str = "", ): super().__init__() - self.tp_size = tp_size or get_tensor_model_parallel_world_size() - self.num_total_experts = num_experts - self.top_k = top_k self.hidden_size = hidden_size - self.intermediate_size = intermediate_size // self.tp_size - self.quant_config = quant_config - - # FIXME(pcmoritz): Make this more general to support different - # quantization schemes - self.use_fp8 = isinstance(quant_config, Fp8Config) - - if params_dtype is None: - params_dtype = 
torch.get_default_dtype() - self.params_dtype = params_dtype # Gate always runs at half / full precision for now. self.gate = ReplicatedLinear( - self.hidden_size, - self.num_total_experts, + hidden_size, + num_experts, bias=False, - params_dtype=self.params_dtype, + params_dtype=params_dtype, quant_config=None, + prefix=f"{prefix}.gate", ) - if self.use_fp8 and self.quant_config.is_checkpoint_fp8_serialized: - params_dtype = torch.float8_e4m3fn - - self.w13_weight = nn.Parameter( - torch.empty( - self.num_total_experts, - 2 * self.intermediate_size, - self.hidden_size, - dtype=params_dtype, - ) - ) - self.w2_weight = nn.Parameter( - torch.empty( - self.num_total_experts, - self.hidden_size, - self.intermediate_size, - dtype=params_dtype, - ) - ) - - set_weight_attrs( - self.w13_weight, - { - "weight_loader": self.weight_loader, - }, - ) - set_weight_attrs( - self.w2_weight, - { - "weight_loader": self.weight_loader, - }, + self.experts = FusedMoE( + num_experts=num_experts, + top_k=top_k, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + params_dtype=params_dtype, + reduce_results=True, + renormalize=True, + quant_config=quant_config, + tp_size=tp_size, + prefix=f"{prefix}.experts", ) - # Used for fp8. - self.w13_scale = None - self.w2_scale = None - self.a13_scale = None - self.a2_scale = None - - if self.use_fp8: - # WEIGHT_SCALE (for fp8) - self.w13_scale = nn.Parameter( - torch.ones(self.num_total_experts, dtype=torch.float32), - requires_grad=False, - ) - self.w2_scale = nn.Parameter( - torch.ones(self.num_total_experts, dtype=torch.float32), - requires_grad=False, - ) - - # If loading fp8 checkpoint, pass the weight loaders. - # If loading an fp16 checkpoint, do not (we will quantize in - # process_weights_after_loading() - if quant_config.is_checkpoint_fp8_serialized: - set_weight_attrs( - self.w13_scale, - { - "weight_loader": self.weight_loader, - }, - ) - set_weight_attrs( - self.w2_scale, - { - "weight_loader": self.weight_loader, - }, - ) - - # ACT_SCALE (for fp8) - if quant_config.activation_scheme == "static": - if not quant_config.is_checkpoint_fp8_serialized: - raise ValueError( - "Found static activation scheme for checkpoint that " - "was not serialized fp8." - ) - self.a13_scale = nn.Parameter( - torch.zeros(self.num_total_experts, dtype=torch.float32), - requires_grad=False, - ) - self.a2_scale = nn.Parameter( - torch.zeros(self.num_total_experts, dtype=torch.float32), - requires_grad=False, - ) - - set_weight_attrs( - self.a13_scale, - { - "weight_loader": self.weight_loader, - }, - ) - set_weight_attrs( - self.a2_scale, - { - "weight_loader": self.weight_loader, - }, - ) - - def weight_loader( - self, - param: nn.Parameter, - loaded_weight: torch.Tensor, - weight_name: str, - expert_id: int, - ): - tp_rank = get_tensor_model_parallel_rank() - param_data = param.data - shard_size = self.intermediate_size - shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) - if weight_name.endswith("w1.weight"): - param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :] - if weight_name.endswith("w3.weight"): - param_data[expert_id, shard_size : 2 * shard_size, :] = loaded_weight[ - shard, : - ] - if weight_name.endswith("w2.weight"): - param_data[expert_id, :, :] = loaded_weight[:, shard] - if "act_scale" in weight_name or "weight_scale" in weight_name: - param_data[expert_id] = loaded_weight - - def process_weights_after_loading(self): - # Fp8 is the only case where we need to process after loading. 
- if not self.use_fp8: - return - - # If checkpoint is fp16, quantize here. - if not self.quant_config.is_checkpoint_fp8_serialized: - w13_weight = torch.empty_like( - self.w13_weight.data, dtype=torch.float8_e4m3fn - ) - w2_weight = torch.empty_like(self.w2_weight.data, dtype=torch.float8_e4m3fn) - for expert in range(self.num_total_experts): - w13_weight[expert, :, :], self.w13_scale[expert] = ops.scaled_fp8_quant( - self.w13_weight.data[expert, :, :] - ) - w2_weight[expert, :, :], self.w2_scale[expert] = ops.scaled_fp8_quant( - self.w2_weight.data[expert, :, :] - ) - self.w13_weight = nn.Parameter(w13_weight, requires_grad=False) - self.w2_weight = nn.Parameter(w2_weight, requires_grad=False) - - # If checkpoint is fp8 + static, cleanup act_scales. - # Since state_dict has an act_scale per expert but our kernels - # are passed one act_scale shared across all experts. - elif self.quant_config.activation_scheme == "static": - if self.a13_scale is None or self.a2_scale is None: - raise ValueError( - "QuantConfig has static quantization, but found " - "activation scales are None." - ) - - if not all_close_1d(self.a13_scale) or not all_close_1d(self.a2_scale): - print_warning_once( - "Found act_scales that are not equal for fp8 MoE layer. " - "Using the maximum across experts for each layer. " - ) - - self.a13_scale = nn.Parameter(self.a13_scale.max(), requires_grad=False) - self.a2_scale = nn.Parameter(self.a2_scale.max(), requires_grad=False) - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - num_tokens, hidden_size = hidden_states.shape + # NOTE: hidden_states can have either 1D or 2D shape. + orig_shape = hidden_states.shape hidden_states = hidden_states.view(-1, self.hidden_size) # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - final_hidden_states = fused_moe( - hidden_states, - self.w13_weight, - self.w2_weight, - router_logits, - self.top_k, - renormalize=True, - inplace=True, - use_fp8=self.use_fp8, - w1_scale=self.w13_scale, - w2_scale=self.w2_scale, - a1_scale=self.a13_scale, - a2_scale=self.a2_scale, - ) - - if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states) - - return final_hidden_states.view(num_tokens, hidden_size) + final_hidden_states = self.experts(hidden_states, router_logits) + return final_hidden_states.view(orig_shape) class MixtralAttention(nn.Module): @@ -291,7 +110,7 @@ def __init__( max_position: int = 4096 * 32, rope_theta: float = 10000, quant_config: Optional[QuantizationConfig] = None, - sliding_window: Optional[int] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = hidden_size @@ -314,7 +133,6 @@ def __init__( self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta - self.sliding_window = sliding_window self.qkv_proj = QKVParallelLinear( hidden_size, @@ -323,12 +141,14 @@ def __init__( self.total_num_kv_heads, bias=False, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, hidden_size, bias=False, quant_config=quant_config, + prefix=f"{prefix}.o_proj", ) self.rotary_emb = get_rope( self.head_dim, @@ -365,6 +185,7 @@ def __init__( config: MixtralConfig, layer_id: int = 0, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -377,8 +198,8 @@ def __init__( num_kv_heads=config.num_key_value_heads, layer_id=layer_id, 
rope_theta=rope_theta, - sliding_window=config.sliding_window, quant_config=quant_config, + prefix=f"{prefix}.self_attn", ) self.block_sparse_moe = MixtralMoE( num_experts=config.num_local_experts, @@ -386,6 +207,7 @@ def __init__( hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, quant_config=quant_config, + prefix=f"{prefix}.block_sparse_moe", ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm( @@ -422,6 +244,7 @@ def __init__( self, config: MixtralConfig, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.padding_idx = config.pad_token_id @@ -431,10 +254,11 @@ def __init__( config.vocab_size, config.hidden_size, ) - # config.num_hidden_layers=16 self.layers = nn.ModuleList( [ - MixtralDecoderLayer(config, i, quant_config=quant_config) + MixtralDecoderLayer( + config, i, quant_config=quant_config, prefix=f"{prefix}.layers" + ) for i in range(config.num_hidden_layers) ] ) @@ -462,6 +286,7 @@ def forward( class MixtralForCausalLM(nn.Module): + def __init__( self, config: MixtralConfig, @@ -471,11 +296,10 @@ def __init__( super().__init__() self.config = config self.quant_config = quant_config - self.model = MixtralModel(config, quant_config=quant_config) + self.model = MixtralModel(config, quant_config=quant_config, prefix="model") self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) - @torch.no_grad() def forward( self, input_ids: torch.Tensor, @@ -496,40 +320,13 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] - expert_params_mapping = ( - [ - # These are the weight scales for the experts - # (param_name, weight_name, expert_id) - ( - "w13_scale" if weight_name in ["w1", "w3"] else "w2_scale", - f"experts.{expert_id}.{weight_name}.weight_scale", - expert_id, - ) - for expert_id in range(self.config.num_local_experts) - for weight_name in ["w1", "w2", "w3"] - ] - + [ - # These are the weights for the experts - # (param_name, weight_name, expert_id) - ( - "w13_weight" if weight_name in ["w1", "w3"] else "w2_weight", - f"experts.{expert_id}.{weight_name}.weight", - expert_id, - ) - for expert_id in range(self.config.num_local_experts) - for weight_name in ["w1", "w2", "w3"] - ] - + [ - # These are the activation scales for the experts - # (param_name, weight_name, expert_id) - ( - "a13_scale" if weight_name in ["w1", "w3"] else "a2_scale", - f"experts.{expert_id}.{weight_name}.act_scale", - expert_id, - ) - for expert_id in range(self.config.num_local_experts) - for weight_name in ["w1", "w2", "w3"] - ] + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts, ) params_dict = dict(self.named_parameters()) @@ -544,25 +341,35 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: continue + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break else: - for param_name, weight_name, expert_id in expert_params_mapping: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: continue name = name.replace(weight_name, param_name) + param = params_dict[name] weight_loader = param.weight_loader weight_loader( - param, loaded_weight, weight_name, expert_id=expert_id + param, + loaded_weight, + weight_name, + shard_id=shard_id, + expert_id=expert_id, ) break else: # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + if name is None: + continue + param = params_dict[name] weight_loader = getattr( param, "weight_loader", default_weight_loader @@ -570,9 +377,4 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader(param, loaded_weight) -def all_close_1d(x: torch.Tensor) -> bool: - assert len(x.shape) == 1 - return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0])) - - EntryClass = MixtralForCausalLM diff --git a/python/sglang/srt/models/mixtral_quant.py b/python/sglang/srt/models/mixtral_quant.py index 115fce1d6d..b02e925c5a 100644 --- a/python/sglang/srt/models/mixtral_quant.py +++ b/python/sglang/srt/models/mixtral_quant.py @@ -160,7 +160,6 @@ def __init__( max_position: int = 4096 * 32, rope_theta: float = 10000, quant_config: Optional[QuantizationConfig] = None, - sliding_window: Optional[int] = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -183,7 +182,6 @@ def __init__( self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta - self.sliding_window = sliding_window self.qkv_proj = QKVParallelLinear( hidden_size, @@ -246,7 +244,6 @@ def __init__( num_kv_heads=config.num_key_value_heads, layer_id=layer_id, rope_theta=rope_theta, - sliding_window=config.sliding_window, quant_config=quant_config, ) self.block_sparse_moe = MixtralMoE(config=config, quant_config=quant_config) diff --git a/test/srt/test_moe_serving_throughput.py b/test/srt/test_moe_serving_throughput.py index da223e80b9..48798c5d5f 100644 --- a/test/srt/test_moe_serving_throughput.py +++ b/test/srt/test_moe_serving_throughput.py @@ -84,7 +84,7 @@ def test_default_without_radix_cache(self): if os.getenv("SGLANG_IS_IN_CI", "false") == "true": # A100 (PCIE) performance - assert res["output_throughput"] > 950 + assert res["output_throughput"] > 940 def test_default_with_chunked_prefill(self): res = self.run_test( From 0909bb0d2f87e3d6a73a8e0dc0e38f55ce44a4d4 Mon Sep 17 00:00:00 2001 From: Ying Sheng Date: Tue, 13 Aug 2024 17:01:26 -0700 Subject: [PATCH 017/118] [Feat] Add window attention for gemma-2 (#1056) --- python/sglang/bench_latency.py | 2 +- python/sglang/srt/layers/radix_attention.py | 59 +++-- .../srt/model_executor/forward_batch_info.py | 203 +++++++++++++----- .../sglang/srt/model_executor/model_runner.py | 86 ++++++-- python/sglang/srt/models/gemma2.py | 16 +- python/sglang/srt/server_args.py | 12 ++ python/sglang/test/long_prompt | 1 + python/sglang/test/runners.py | 26 ++- scripts/playground/reference_hf.py | 8 +- test/srt/models/test_embedding_models.py | 10 +- test/srt/models/test_generation_models.py | 22 +- 11 files changed, 319 insertions(+), 126 deletions(-) create mode 100644 python/sglang/test/long_prompt diff --git 
a/python/sglang/bench_latency.py b/python/sglang/bench_latency.py index c2b956e1da..ee227849cf 100644 --- a/python/sglang/bench_latency.py +++ b/python/sglang/bench_latency.py @@ -64,7 +64,7 @@ class BenchArgs: run_name: str = "before" batch_size: Tuple[int] = (1,) input_len: Tuple[int] = (1024,) - output_len: Tuple[int] = (4,) + output_len: Tuple[int] = (16,) result_filename: str = "" correctness_test: bool = False # This is only used for correctness test diff --git a/python/sglang/srt/layers/radix_attention.py b/python/sglang/srt/layers/radix_attention.py index 1568cf6d96..49b86ad191 100644 --- a/python/sglang/srt/layers/radix_attention.py +++ b/python/sglang/srt/layers/radix_attention.py @@ -34,6 +34,7 @@ def __init__( scaling: float, num_kv_heads: int, layer_id: int, + sliding_window_size: int = -1, logit_cap: int = -1, v_head_dim: int = -1, ): @@ -46,6 +47,7 @@ def __init__( self.v_head_dim = v_head_dim if v_head_dim != -1 else head_dim self.scaling = scaling self.layer_id = layer_id + self.sliding_window_size = sliding_window_size if ( not global_server_args_dict.get("disable_flashinfer", False) @@ -113,39 +115,51 @@ def decode_forward_triton(self, q, k, v, input_metadata: InputMetadata): return o def extend_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata): + # using two wrappers is unnecessary in the current PR, but are prepared for future PRs + prefill_wrapper_ragged = input_metadata.flashinfer_prefill_wrapper_ragged + prefill_wrapper_paged = input_metadata.flashinfer_prefill_wrapper_paged + if self.sliding_window_size != -1: + prefill_wrapper_ragged = prefill_wrapper_ragged[0] + prefill_wrapper_paged = prefill_wrapper_paged[0] + else: + if isinstance(prefill_wrapper_ragged, list): + prefill_wrapper_ragged = prefill_wrapper_ragged[1] + if isinstance(prefill_wrapper_paged, list): + prefill_wrapper_paged = prefill_wrapper_paged[1] + if not input_metadata.flashinfer_use_ragged: self.store_kv_cache(k, v, input_metadata) - o = input_metadata.flashinfer_prefill_wrapper_paged.forward( + o = prefill_wrapper_paged.forward( q.contiguous().view(-1, self.tp_q_head_num, self.head_dim), input_metadata.token_to_kv_pool.get_kv_buffer(self.layer_id), causal=True, sm_scale=self.scaling, + window_left=self.sliding_window_size, logits_soft_cap=self.logit_cap, ) else: - o1, s1 = ( - input_metadata.flashinfer_prefill_wrapper_ragged.forward_return_lse( - q.contiguous().view(-1, self.tp_q_head_num, self.head_dim), - k.contiguous().view(-1, self.tp_k_head_num, self.head_dim), - v.contiguous().view(-1, self.tp_v_head_num, self.head_dim), - causal=True, - sm_scale=self.scaling, - logits_soft_cap=self.logit_cap, - ) + o1, s1 = prefill_wrapper_ragged.forward_return_lse( + q.contiguous().view(-1, self.tp_q_head_num, self.head_dim), + k.contiguous().view(-1, self.tp_k_head_num, self.head_dim), + v.contiguous().view(-1, self.tp_v_head_num, self.head_dim), + causal=True, + sm_scale=self.scaling, + window_left=self.sliding_window_size, + logits_soft_cap=self.logit_cap, ) if input_metadata.extend_no_prefix: o = o1 else: - o2, s2 = ( - input_metadata.flashinfer_prefill_wrapper_paged.forward_return_lse( - q.contiguous().view(-1, self.tp_q_head_num, self.head_dim), - input_metadata.token_to_kv_pool.get_kv_buffer(self.layer_id), - causal=False, - sm_scale=self.scaling, - logits_soft_cap=self.logit_cap, - ) + # TODO window attention + radix attention will come up in next PR + assert self.sliding_window_size == -1 + o2, s2 = prefill_wrapper_paged.forward_return_lse( + q.contiguous().view(-1, 
self.tp_q_head_num, self.head_dim), + input_metadata.token_to_kv_pool.get_kv_buffer(self.layer_id), + causal=False, + sm_scale=self.scaling, + logits_soft_cap=self.logit_cap, ) o, _ = merge_state(o1, s1, o2, s2) @@ -158,9 +172,16 @@ def extend_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata): return o.view(-1, self.tp_q_head_num * self.head_dim) def decode_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata): + decode_wrapper = input_metadata.flashinfer_decode_wrapper + if self.sliding_window_size != -1: + decode_wrapper = decode_wrapper[0] + else: + if isinstance(decode_wrapper, list): + decode_wrapper = decode_wrapper[1] + self.store_kv_cache(k, v, input_metadata) - o = input_metadata.flashinfer_decode_wrapper.forward( + o = decode_wrapper.forward( q.contiguous().view(-1, self.tp_q_head_num, self.head_dim), input_metadata.token_to_kv_pool.get_kv_buffer(self.layer_id), sm_scale=self.scaling, diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py index eb7aaaf2c1..3b2ee9de06 100644 --- a/python/sglang/srt/model_executor/forward_batch_info.py +++ b/python/sglang/srt/model_executor/forward_batch_info.py @@ -16,7 +16,7 @@ """ModelRunner runs the forward passes of the models.""" from dataclasses import dataclass from enum import IntEnum, auto -from typing import TYPE_CHECKING, List +from typing import TYPE_CHECKING, List, Optional import numpy as np import torch @@ -154,6 +154,7 @@ def from_schedule_batch( model_runner: "ModelRunner", batch: ScheduleBatch, forward_mode: ForwardMode, + sliding_window_size: Optional[int] = None, ): ret = cls( forward_mode=forward_mode, @@ -197,7 +198,7 @@ def from_schedule_batch( ): flashinfer_use_ragged = True ret.init_flashinfer_handlers( - model_runner, prefix_lens, flashinfer_use_ragged + model_runner, prefix_lens, flashinfer_use_ragged, sliding_window_size ) return ret @@ -216,7 +217,11 @@ def init_triton_args(self, batch: ScheduleBatch, prefix_lens): self.triton_max_extend_len = int(torch.max(extend_seq_lens)) def init_flashinfer_handlers( - self, model_runner, prefix_lens, flashinfer_use_ragged + self, + model_runner, + prefix_lens, + flashinfer_use_ragged, + sliding_window_size=None, ): update_flashinfer_indices( self.forward_mode, @@ -225,6 +230,7 @@ def init_flashinfer_handlers( self.seq_lens, prefix_lens, flashinfer_use_ragged=flashinfer_use_ragged, + sliding_window_size=sliding_window_size, ) ( @@ -248,6 +254,7 @@ def update_flashinfer_indices( prefix_lens, flashinfer_decode_wrapper=None, flashinfer_use_ragged=False, + sliding_window_size=None, ): """Init auxiliary variables for FlashInfer attention backend.""" num_qo_heads = model_runner.model_config.num_attention_heads // model_runner.tp_size @@ -255,65 +262,145 @@ def update_flashinfer_indices( head_dim = model_runner.model_config.head_dim batch_size = len(req_pool_indices) - if flashinfer_use_ragged: - paged_kernel_lens = prefix_lens - else: - paged_kernel_lens = seq_lens - - kv_indptr = torch.zeros((batch_size + 1,), dtype=torch.int32, device="cuda") - kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0) - req_pool_indices_cpu = req_pool_indices.cpu().numpy() - paged_kernel_lens_cpu = paged_kernel_lens.cpu().numpy() - kv_indices = torch.cat( - [ - model_runner.req_to_token_pool.req_to_token[ - req_pool_indices_cpu[i], : paged_kernel_lens_cpu[i] - ] - for i in range(batch_size) - ], - dim=0, - ).contiguous() - kv_last_page_len = torch.ones((batch_size,), dtype=torch.int32, device="cuda") - - if 
forward_mode == ForwardMode.DECODE: - # CUDA graph uses different flashinfer_decode_wrapper - if flashinfer_decode_wrapper is None: - flashinfer_decode_wrapper = model_runner.flashinfer_decode_wrapper - - flashinfer_decode_wrapper.end_forward() - flashinfer_decode_wrapper.begin_forward( - kv_indptr, - kv_indices, - kv_last_page_len, - num_qo_heads, - num_kv_heads, - head_dim, - 1, - ) - else: - # extend part - qo_indptr = torch.zeros((batch_size + 1,), dtype=torch.int32, device="cuda") - qo_indptr[1:] = torch.cumsum(seq_lens - prefix_lens, dim=0) - + if sliding_window_size is None: if flashinfer_use_ragged: - model_runner.flashinfer_prefill_wrapper_ragged.end_forward() - model_runner.flashinfer_prefill_wrapper_ragged.begin_forward( - qo_indptr, + paged_kernel_lens = prefix_lens + else: + paged_kernel_lens = seq_lens + + kv_indptr = torch.zeros((batch_size + 1,), dtype=torch.int32, device="cuda") + kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0) + req_pool_indices_cpu = req_pool_indices.cpu().numpy() + paged_kernel_lens_cpu = paged_kernel_lens.cpu().numpy() + kv_indices = torch.cat( + [ + model_runner.req_to_token_pool.req_to_token[ + req_pool_indices_cpu[i], : paged_kernel_lens_cpu[i] + ] + for i in range(batch_size) + ], + dim=0, + ).contiguous() + kv_last_page_len = torch.ones((batch_size,), dtype=torch.int32, device="cuda") + + if forward_mode == ForwardMode.DECODE: + # CUDA graph uses different flashinfer_decode_wrapper + if flashinfer_decode_wrapper is None: + flashinfer_decode_wrapper = model_runner.flashinfer_decode_wrapper + + flashinfer_decode_wrapper.end_forward() + flashinfer_decode_wrapper.begin_forward( + kv_indptr, + kv_indices, + kv_last_page_len, + num_qo_heads, + num_kv_heads, + head_dim, + 1, + ) + else: + # extend part + qo_indptr = torch.zeros((batch_size + 1,), dtype=torch.int32, device="cuda") + qo_indptr[1:] = torch.cumsum(seq_lens - prefix_lens, dim=0) + + if flashinfer_use_ragged: + model_runner.flashinfer_prefill_wrapper_ragged.end_forward() + model_runner.flashinfer_prefill_wrapper_ragged.begin_forward( + qo_indptr, + qo_indptr, + num_qo_heads, + num_kv_heads, + head_dim, + ) + + # cached part + model_runner.flashinfer_prefill_wrapper_paged.end_forward() + model_runner.flashinfer_prefill_wrapper_paged.begin_forward( qo_indptr, + kv_indptr, + kv_indices, + kv_last_page_len, num_qo_heads, num_kv_heads, head_dim, + 1, ) + else: + kv_last_page_len = torch.ones((batch_size,), dtype=torch.int32, device="cuda") + for wrapper_id in range(2): + if flashinfer_use_ragged: + paged_kernel_lens = prefix_lens + else: + paged_kernel_lens = seq_lens - # cached part - model_runner.flashinfer_prefill_wrapper_paged.end_forward() - model_runner.flashinfer_prefill_wrapper_paged.begin_forward( - qo_indptr, - kv_indptr, - kv_indices, - kv_last_page_len, - num_qo_heads, - num_kv_heads, - head_dim, - 1, - ) + if wrapper_id == 0 and forward_mode == ForwardMode.DECODE: + paged_kernel_lens = torch.minimum( + paged_kernel_lens, torch.tensor(sliding_window_size) + ) + kv_start_idx = seq_lens - paged_kernel_lens + else: + kv_start_idx = torch.zeros(batch_size, dtype=torch.int32, device="cuda") + + kv_indptr = torch.zeros((batch_size + 1,), dtype=torch.int32, device="cuda") + kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0) + req_pool_indices_cpu = req_pool_indices.cpu().numpy() + paged_kernel_lens_cpu = paged_kernel_lens.cpu().numpy() + kv_indices = torch.cat( + [ + model_runner.req_to_token_pool.req_to_token[ + req_pool_indices_cpu[i], + kv_start_idx[i] : kv_start_idx[i] + 
paged_kernel_lens_cpu[i], + ] + for i in range(batch_size) + ], + dim=0, + ).contiguous() + + if forward_mode == ForwardMode.DECODE: + # CUDA graph uses different flashinfer_decode_wrapper + if flashinfer_decode_wrapper is None: + flashinfer_decode_wrapper = model_runner.flashinfer_decode_wrapper + + flashinfer_decode_wrapper[wrapper_id].end_forward() + flashinfer_decode_wrapper[wrapper_id].begin_forward( + kv_indptr, + kv_indices, + kv_last_page_len, + num_qo_heads, + num_kv_heads, + head_dim, + 1, + ) + else: + # extend part + qo_indptr = torch.zeros( + (batch_size + 1,), dtype=torch.int32, device="cuda" + ) + qo_indptr[1:] = torch.cumsum(seq_lens - prefix_lens, dim=0) + + if flashinfer_use_ragged: + model_runner.flashinfer_prefill_wrapper_ragged[ + wrapper_id + ].end_forward() + model_runner.flashinfer_prefill_wrapper_ragged[ + wrapper_id + ].begin_forward( + qo_indptr, + qo_indptr, + num_qo_heads, + num_kv_heads, + head_dim, + ) + + # cached part + model_runner.flashinfer_prefill_wrapper_paged[wrapper_id].end_forward() + model_runner.flashinfer_prefill_wrapper_paged[wrapper_id].begin_forward( + qo_indptr, + kv_indptr, + kv_indices, + kv_last_page_len, + num_qo_heads, + num_kv_heads, + head_dim, + 1, + ) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 574ad36580..34a40c7d71 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -295,7 +295,16 @@ def init_cublas(self): return c def init_flashinfer(self): + self.sliding_window_size = ( + self.model.get_window_size() + if hasattr(self.model, "get_window_size") + else None + ) + if self.server_args.disable_flashinfer: + assert ( + self.sliding_window_size is None + ), "turn on flashinfer to support window attention" self.flashinfer_prefill_wrapper_ragged = None self.flashinfer_prefill_wrapper_paged = None self.flashinfer_decode_wrapper = None @@ -309,20 +318,54 @@ def init_flashinfer(self): else: use_tensor_cores = False - self.flashinfer_workspace_buffers = torch.empty( - 2, global_config.flashinfer_workspace_size, dtype=torch.uint8, device="cuda" - ) - self.flashinfer_prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper( - self.flashinfer_workspace_buffers[0], "NHD" - ) - self.flashinfer_prefill_wrapper_paged = BatchPrefillWithPagedKVCacheWrapper( - self.flashinfer_workspace_buffers[1], "NHD" - ) - self.flashinfer_decode_wrapper = BatchDecodeWithPagedKVCacheWrapper( - self.flashinfer_workspace_buffers[0], - "NHD", - use_tensor_cores=use_tensor_cores, - ) + if self.sliding_window_size is None: + self.flashinfer_workspace_buffers = torch.empty( + 2, + global_config.flashinfer_workspace_size, + dtype=torch.uint8, + device="cuda", + ) + self.flashinfer_prefill_wrapper_ragged = ( + BatchPrefillWithRaggedKVCacheWrapper( + self.flashinfer_workspace_buffers[0], "NHD" + ) + ) + self.flashinfer_prefill_wrapper_paged = BatchPrefillWithPagedKVCacheWrapper( + self.flashinfer_workspace_buffers[1], "NHD" + ) + self.flashinfer_decode_wrapper = BatchDecodeWithPagedKVCacheWrapper( + self.flashinfer_workspace_buffers[0], + "NHD", + use_tensor_cores=use_tensor_cores, + ) + else: + workspace_buffers = torch.empty( + 4, + global_config.flashinfer_workspace_size, + dtype=torch.uint8, + device="cuda", + ) + self.flashinfer_prefill_wrapper_ragged = [] + self.flashinfer_prefill_wrapper_paged = [] + self.flashinfer_decode_wrapper = [] + for i in range(2): + self.flashinfer_prefill_wrapper_ragged.append( + 
BatchPrefillWithRaggedKVCacheWrapper( + workspace_buffers[2 * i + 0], "NHD" + ) + ) + self.flashinfer_prefill_wrapper_paged.append( + BatchPrefillWithPagedKVCacheWrapper( + workspace_buffers[2 * i + 1], "NHD" + ) + ) + self.flashinfer_decode_wrapper.append( + BatchDecodeWithPagedKVCacheWrapper( + workspace_buffers[2 * i + 0], + "NHD", + use_tensor_cores=use_tensor_cores, + ) + ) def init_cuda_graphs(self): from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner @@ -358,7 +401,10 @@ def forward_decode(self, batch: ScheduleBatch): return self.cuda_graph_runner.replay(batch) input_metadata = InputMetadata.from_schedule_batch( - self, batch, ForwardMode.DECODE + self, + batch, + ForwardMode.DECODE, + sliding_window_size=self.sliding_window_size, ) return self.model.forward( @@ -368,7 +414,10 @@ def forward_decode(self, batch: ScheduleBatch): @torch.inference_mode() def forward_extend(self, batch: ScheduleBatch): input_metadata = InputMetadata.from_schedule_batch( - self, batch, forward_mode=ForwardMode.EXTEND + self, + batch, + forward_mode=ForwardMode.EXTEND, + sliding_window_size=self.sliding_window_size, ) return self.model.forward( batch.input_ids, input_metadata.positions, input_metadata @@ -377,7 +426,10 @@ def forward_extend(self, batch: ScheduleBatch): @torch.inference_mode() def forward_extend_multi_modal(self, batch: ScheduleBatch): input_metadata = InputMetadata.from_schedule_batch( - self, batch, forward_mode=ForwardMode.EXTEND + self, + batch, + forward_mode=ForwardMode.EXTEND, + sliding_window_size=self.sliding_window_size, ) return self.model.forward( batch.input_ids, diff --git a/python/sglang/srt/models/gemma2.py b/python/sglang/srt/models/gemma2.py index db87624d2d..463d5e5054 100644 --- a/python/sglang/srt/models/gemma2.py +++ b/python/sglang/srt/models/gemma2.py @@ -44,6 +44,12 @@ from sglang.srt.model_executor.forward_batch_info import InputMetadata +# Aligned with HF's implementation, using sliding window inclusive with the last token +# SGLang assumes exclusive +def get_window_size(config): + return config.sliding_window - 1 + + class GemmaRMSNorm(CustomOp): """RMS normalization for Gemma. @@ -200,17 +206,14 @@ def __init__( dtype=torch.get_default_dtype(), ) - # from vLLM: FIXME(woosuk): While Gemma 2 uses sliding window attention for every - # odd layer, vLLM currently ignores it and uses global attention for - # all layers. - use_sliding_window = layer_idx % 2 == 1 and config.sliding_window is not None - del use_sliding_window # Unused. 
+ use_sliding_window = layer_idx % 2 == 0 and hasattr(config, "sliding_window") self.attn = RadixAttention( self.num_heads, self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, layer_id=layer_idx, + sliding_window_size=get_window_size(config) if use_sliding_window else -1, logit_cap=self.config.attn_logit_softcapping, ) @@ -403,6 +406,9 @@ def forward( input_ids, hidden_states, self.model.embed_tokens.weight, input_metadata ) + def get_window_size(self): + return get_window_size(self.config) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 474c80b256..5e7996b801 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -17,9 +17,12 @@ import argparse import dataclasses +import logging import random from typing import List, Optional, Union +logger = logging.getLogger(__name__) + @dataclasses.dataclass class ServerArgs: @@ -446,6 +449,15 @@ def check_server_args(self): assert not ( self.dp_size > 1 and self.node_rank is not None ), "multi-node data parallel is not supported" + if "gemma-2" in self.model_path.lower(): + logger.info( + f"When using sliding window in gemma-2, disable radix_cache, regex_jump_forward, and turn on flashinfer." + ) + self.disable_radix_cache = True + self.disable_regex_jump_forward = True + self.disable_flashinfer = False + self.disable_cuda_graph = True + self.chunked_prefill_size = None @dataclasses.dataclass diff --git a/python/sglang/test/long_prompt b/python/sglang/test/long_prompt new file mode 100644 index 0000000000..301d7e107d --- /dev/null +++ b/python/sglang/test/long_prompt @@ -0,0 +1 @@ +You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\nIntroduction\n\nThroughout U.S. history, Congress has created advisory commissions to assist in the development of public policy. Among other contexts, commissions have been used following crisis situations, including the September 11, 2001, terrorist attacks and the 2008 financial crisis. In such situations, advisory commissions may potentially provide Congress with a high-visibility forum to assemble expertise that might not exist within the legislative environment; allow for the in-depth examination of complex, cross-cutting policy issues; and lend bipartisan credibility to a set of findings and recommendations.\nAs Congress considers its range of responses to the coronavirus pandemic, the creation of one or more congressional advisory commissions is an option that could provide a platform for evaluating various pandemic-related policy issues over time. Past congressional advisory commissions have retrospectively evaluated policy responses, brought together diverse groups of experts, and supplemented existing congressional oversight mechanisms. Policymakers may determine that creating an advisory commission is unnecessary and instead prefer to utilize existing congressional oversight structures, such as standing or select committees, or already established oversight entities.\nThis report provides a comparative analysis of five proposed congressional advisory commissions that would investigate various aspects of the COVID-19 pandemic. The five proposed commissions are found in H.R. 6429 (the National Commission on COVID-19 Act, sponsored by Representative Stephanie Murphy), H.R. 
6431 (the Made in America Emergency Preparedness Act, sponsored by Representative Brian Fitzpatrick), H.R. 6440 (the Pandemic Rapid Response Act, sponsored by Representative Rodney Davis), H.R. 6455 (the COVID-19 Commission Act, sponsored by Representative Bennie Thompson), and H.R. 6548 (the National Commission on the COVID-19 Pandemic in the United States Act, sponsored by Representative Adam Schiff). The overall structures of each of the proposed commissions are similar in many respects, both to each other and to previous independent advisory entities established by Congress. Specifically, the proposed commissions would (1) exist temporarily; (2) serve in an advisory capacity; and (3) report a work product detailing the commission\'s findings, conclusions, and recommendations. That said, each particular proposed commission has distinctive elements, particularly concerning its membership structure, appointment structure, and time line for reporting its work product to Congress.\nThis report compares the (1) membership structure, (2) appointment structure, (3) rules of procedure and operation, (4) duties and reporting requirements, (5) powers of the commission, (6) staffing issues, and (7) funding for each of the proposed COVID-19 commissions. Table 1 (at the end of this report) provides a side-by-side comparison of major provisions of the five proposals.\n\n Membership Structure\n\nSeveral matters related to a commission\'s membership structure might be considered. They include the size of a commission, member qualifications, compensation of commission members, and requirements for partisan balance. \n\n Size of Commission\n\nIn general, there is significant variation in the size of congressional advisory commissions. Among 155 identified congressional commissions created between the 101 st Congress and the 115 th Congress, the median size was 12 members, with the smallest commission having 5 members and the largest 33 members.\nThe membership structure of each of the five proposed commissions is similar to previous independent advisory entities created by Congress. H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 would each create a 10-member entity. H.R. 6455 would create a 25-member entity.\n\n Qualifications\n\nPast legislation creating congressional commissions has often required or suggested that commission members possess certain substantive qualifications. Such provisions arguably make it more likely that the commission is populated with genuine experts in the policy area, which may improve the commission\'s final work product.\nH.R. 6455 would provide that commissioners \"shall be a United States person with significant expertise\" in a variety of fields related to public health and public administration. H.R. 6440 , H.R. 6429 , H.R. 6431 , and H.R. 6548 would provide \"the sense of Congress\" that commission members should be \"prominent U.S. citizens\" who are nationally recognized experts in a variety of fields relevant to the pandemic and response efforts. In addition, H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 all prohibit the appointment of federal, state, and local government employees and officers. H.R. 6455 would prohibit federal employees from being commission members.\n\n Compensation of Commission Members\n\nSome congressional commissions have compensated their members. 
For example, the National Commission on Terrorist Attacks Upon the United States (9/11 Commission) and the Financial Crisis Inquiry Commission provided that commission members could be compensated at a daily rate of basic pay. Nearly all have reimbursed members for travel expenses. Those that have provided for commissioner compensation most frequently provided compensation at the daily equivalent of level IV of the Executive Schedule.\nEach of the five proposals would provide that commission members be compensated at a rate \"not to exceed the daily equivalent of the annual rate of basic pay\" for level IV of the Executive Schedule, \"for each day during which that member is engaged in the actual performance of duties of the Commission.\" Members of three proposed commissions would receive travel expenses, including a per diem.\n\n Partisan Limitations\n\nEach proposal provides a limit on the number of members appointed from the same political party. H.R. 6455 would provide that not more than 13 of its 25 members may be from the same party. H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 would provide that not more than 5 (of 10) members are from the same party. Most previous advisory entities created by Congress do not impose formal partisan restrictions on the membership structure. It may also be difficult to assess the political affiliation of potential members, who may have no formal affiliation (voter registration, for example) with a political party. Instead, most past advisory commissions usually achieve partisan balance through the appointment structure; for instance, by providing equal (or near-equal) numbers of appointments to congressional leaders of each party.\n\n Appointment Structure\n\nPast congressional commissions have used a wide variety of appointment structures. Considerations regarding appointment structures include partisan balance, filling vacancies, and the time line for making commission appointments.\nThe statutory scheme may directly designate members of the commission, such as a specific cabinet official or a congressional leader. In other cases, selected congressional leaders, often with balance between the parties, appoint commission members. A third common statutory scheme is to have selected leaders, such as committee chairs and ranking members, recommend candidates for appointment to a commission. These selected leaders may act either in parallel or jointly, and the recommendation may be made either to other congressional leaders, such as the Speaker of the House and President pro tempore of the Senate, or to the President.\nEach of the five commission proposals would delegate most or all appointment authority to congressional leaders (including chamber, party, and committee leaders; see Table 1 for details). Additionally, H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 provide for one appointment to be made by the President. H.R. 6429 , H.R. 6431 , and H.R. 6548 would have the President appoint the commission\'s chair. H.R. 6455 has its membership appointed by the chairs and ranking members of designated House and Senate committees, and the Joint Economic Committee. H.R. 
6455 does not provide any executive branch appointments.\nAttention to the proper balance between the number of members appointed by congressional leaders and by other individuals (such as the President), or to the number of Members of Congress required to be among the appointees, or to the qualifications of appointees, can be significant factors in enabling a commission to fulfill its congressional mandate.\nIn general, a commission\'s appointment scheme can impact both the commission\'s ability to fulfill its statutory duties and its final work product. For instance, if the scheme provides only for the appointment of Members of Congress to the commission, it arguably might not have the technical expertise or diversity of knowledge to complete its duties within the time given by statute. Similarly, if the appointment scheme includes qualifying provisos so specific that only a small set of private citizens could serve on the panel, the commission\'s final work product may arguably only represent a narrow range of viewpoints. None of the proposed COVID-19 commissions specify whether Members of Congress may serve on the commission.\n\n Partisan Balance in Appointment Authority\n\nMost previous congressional advisory commissions have been structured to be bipartisan, with an even (or near-even) split of appointments between leaders of the two major parties. By achieving a nonpartisan or bipartisan character, congressional commissions may make their findings and recommendations more politically acceptable to diverse viewpoints. The bipartisan or nonpartisan arrangement can give recommendations strong credibility, both in Congress and among the public, even when dealing with divisive public policy issues. Similarly, commission recommendations that are perceived as partisan may have difficulty gaining support in Congress.\nIn some cases, however, bipartisanship also can arguably impede a commission\'s ability to complete its mandate. In situations where a commission is tasked with studying divisive or partisan issues, the appointment of an equal number of majority and minority commissioners may serve to promote partisanship within the commission rather than suppress it, raising the possibility of deadlock where neither side can muster a majority to act.\nEach of the five proposals employs a structure where leaders in both the majority and minority parties in Congress would make appointments. H.R. 6429 , H.R. 6431 , and H.R. 6548 would provide for five majority and five minority appointments, including one for the President. H.R. 6440 would include two each by the Senate majority leader, the Senate minority leader, and the Speaker of the House, with one appointment by the House minority leader and one by the President, and the chair appointed by the Speaker and vice chair appointed by the Senate majority leader. H.R. 6455 would have 12 majority and 12 minority appointments made by the 12 committee chairs and ranking members and one member jointly appointed by the chair and vice chair of the Joint Economic Committee.\n\n Vacancies\n\nAll five proposals provide that vacancies on the commission will not affect its powers and would be filled in the same manner as the original appointment.\n\n Deadline for Appointments\n\nThree of the bills propose specific deadlines for the appointment of commissioners. H.R. 6429 and H.R. 6548 provide that appointments are made between specific dates in January or February 2021. Further, H.R. 
6429 provides that commission members could be appointed in September 2020, if there is no longer a COVID-19 public health emergency in effect—as determined by the Secretary of Health and Human Services—as of August 31, 2020. H.R. 6440 would require all appointments be made by December 15, 2020. H.R. 6455 would require appointments to be made within 45 days after enactment. H.R. 6429 , H.R. 6440 , and H.R. 6548 would start the commission\'s work in early 2021, as the commission cannot operate without the appointment of members. H.R. 6429 , however would provide that the proposed commission\'s work would begin no later than October 31, 2020, if members are appointed in September 2020. H.R. 6431 does not specify a deadline for the appointment of members.\nTypically, deadlines for appointment can range from several weeks to several months. For example, the deadline for appointments to the Antitrust Modernization Commission was 60 days after the enactment of its establishing act. The deadline for appointment to the Commission on Wartime Contracting in Iraq and Afghanistan was 120 days from the date of enactment. The deadline for appointment to the 9/11 Commission was December 15, 2002, 18 days after enactment of the act.\n\n Rules of Procedure and Operations\n\nWhile most statutes that authorize congressional advisory commissions do not provide detailed procedures for how the commission should conduct its business, the statutory language may provide a general structure, including a mechanism for selecting a chair and procedures for creating rules. None of the five COVID-19 commission proposals contain language that directs the process for potentially adopting rules of procedure. For a comparison of each proposed commission\'s specified rules of procedures and operations, see Table 1 .\n\n Chair Selection\n\nEach bill provides for the selection of a chair and/or vice chair of the commission. H.R. 6429 , H.R. 6431 , and H.R. 6548 would have the chair appointed by the President and the vice chair appointed by congressional leaders of the political party opposite the President. H.R. 6440 would have the chair appointed by the Speaker of the House (in consultation with the Senate majority leader and the House minority leader) and the vice chair appointed by the Senate majority leader (in consultation with the Speaker of the House and the Senate minority leader). H.R. 6455 would have the chair and vice chair chosen from among commission members by a majority vote of the commission, and would require the chair and vice chair to have \"significant experience\" in areas to be studied by the commission.\n\n Initial Meeting Deadline\n\nAs with the timing of commission appointments, some authorizing statutes are prescriptive in when the commission\'s first meeting should take place. Three of the bills analyzed here provide specific time lines for the commission\'s first meeting. H.R. 6429 would require the first meeting to be no later than March 15, 2021, unless members are appointed in September 2020 (if no public health emergency exists). H.R. 6455 would require the first meeting within 45 days after the appointment of all commission members, which is—given the 45-day deadline for appointment—effectively a maximum of 90 days after enactment. H.R. 6548 would direct the commission to hold its initial meeting \"as soon as practicable,\" but not later than March 5, 2021. H.R. 6431 and H.R. 6440 do not provide for an initial meeting deadline. 
Instead, they direct the commission to meet \"as soon as practicable.\" \n\n Quorum\n\nMost commission statutes provide that a quorum will consist of a particular number of commissioners, usually a majority, but occasionally a supermajority. All five bills would provide for a quorum requirement. H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 would define a quorum as 6 (of 10) members. H.R. 6455 would provide that a quorum is 18 of 25 members (72%).\n\n Public Access\n\nAll five commission bills would require commission meetings to be open to the public. Each bill would also require that reports be made publicly available.\n\n Formulating Other Rules of Procedure and Operations\n\nAbsent statutory guidance (eithe r in general statutes or in individual statutes authorizing commissions), advisory entities vary widely in how they adopt their rules of procedure. In general, three models exist: formal written rules, informal rules, and the reliance on norms. Any individual advisory entity might make use of all three of these models for different types of decisionmaking. \nThe choice to adopt written rules or rely on informal norms to guide commission procedure may be based on a variety of factors, such as the entity\'s size, the frequency of meetings, member preferences regarding formality, the level of collegiality among members, and the amount of procedural guidance provided by the entity\'s authorizing statute. Regardless of how procedural issues are handled, protocol for decisionmaking regarding the following operational issues may be important for the commission to consider at the outset of its existence: eligibility to vote and proxy rules; staff hiring, compensation, and work assignments; hearings, meetings, and field visits; nonstaff expenditures and contracting; reports to Congress; budgeting; and procedures for future modification of rules. None of the five COVID-19 commission proposals specify that the proposed commission must adopt written rules.\n\n FACA Applicability\n\nThe Federal Advisory Committee Act (FACA) mandates certain structural and operational requirements, including formal reporting and oversight procedures, for certain federal advisory bodies that advise the executive branch. Three proposals ( H.R. 6429 , H.R. 6431 , and H.R. 6548 ) specifically exempt the proposed commission from FACA. Of the remaining two, FACA would also likely not apply to the commission proposed in H.R. 6455 because it would be appointed entirely by Members of Congress, although it only specifies that its final report is public, not whether it is specifically sent to Congress and/or the President. It is not clear that FACA would apply to the commission proposed in H.R. 6440 . Although it includes a presidential appointment and its report would be sent to both Congress and the President, its establishment clause specifies that the commission \"is established in the legislative branch,\" and a super-majority of its members would be appointed by Congress.\n\n Duties and Reporting Requirements\n\nMost congressional commissions are generally considered policy commissions—temporary bodies that study particular policy problems and report their findings to Congress or review a specific event. \n\n General Duties\n\nAll five of the proposed commissions would be tasked with duties that are analogous to those of past policy commissions. 
While the specific mandates differ somewhat, all proposed commissions are tasked with investigating aspects of the COVID-19 pandemic and submitting one or more reports that include the commission\'s findings, conclusions, and recommendations for legislative action. H.R. 6440 would specifically require the commission to avoid unnecessary duplication of work being conducted by the Government Accountability Office (GAO), congressional committees, and executive branch agency and independent commission investigations.\n\n Reports\n\nEach proposed commission would be tasked with issuing a final report detailing its findings, conclusions, and recommendations. H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 would provide that the commission \"may submit\" interim reports to Congress and the President, but do not provide time lines on when those reports might be submitted. In each case, the interim report would need to be agreed to by a majority of commission members. H.R. 6431 would also require the commission to submit a report on actions taken by the states and a report on essential products, materials, ingredients, and equipment required to fight pandemics.\nH.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 also specify that final reports shall be agreed to by a majority of commission members. H.R. 6455 does not specify a vote threshold for approval of its report.\nNone of the bills make specific provisions for the inclusion of minority viewpoints. Presumably this would leave each commission with discretion on whether to include or exclude minority viewpoints. Past advisory entities have been proposed or established with a variety of statutory reporting conditions, including the specification of majority or super-majority rules for report adoption and provisions requiring the inclusion of minority viewpoints. In practice, advisory bodies that are not given statutory direction on these matters have tended to work under simple-majority rules for report adoption.\n\n Report Deadlines\n\nH.R. 6429 would require a final report one year after the commission\'s initial meeting. H.R. 6431 and H.R. 6440 would require a final report not later than 18 months after enactment. H.R. 6455 would require a final report to be published not later than 18 months after the commission\'s first meeting. \nH.R. 6548 would require a final report by October 15, 2021. This deadline could be extended by 90 days upon a vote of no fewer than 8 (out of 10) commission members. The commission could vote to extend its final report deadline up to three times, and would be required to notify Congress, the President, and the public of any such extension.\nWhile such a deadline would potentially give the commission a defined period of time to complete its work, setting a particular date for report completion could potentially create unintended time constraints. Any delay in the passage of the legislation or in the appointment process would reduce the amount of time the commission has to complete its work, even with the opportunity for the commission to extend its own deadline up to three times.\nThe length of time a congressional commission has to complete its work is arguably one of the most consequential decisions when designing an advisory entity. 
If the entity has a short window of time, the quality of its work product may suffer or it may not be able to fulfill its statutory mandate on time.\nOn the other hand, if the commission is given a long period of time to complete its work, it may undermine one of a commission\'s primary legislative advantages, the timely production of expert advice on a current matter. A short deadline may also affect the process of standing up a new commission. The selection of commissioners, recruitment of staff, arrangement of office space, and other logistical matters may require expedited action if short deadlines need to be met.\n\n Report Submission\n\nOf the five proposed commissions, four ( H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 ) are directed to submit their reports to both Congress and the President. H.R. 6455 requires that the report is made public.\nMost congressional advisory commissions are required to submit their reports to Congress, and sometimes to the President or an executive department or agency head. For example, the National Commission on Severely Distressed Public Housing\'s final report was submitted to both Congress and the Secretary of Housing and Urban Development.\n\n Commission Termination\n\nCongressional commissions are usually statutorily mandated to terminate. Termination dates for most commissions are linked to either a fixed period of time after the establishment of the commission, the selection of members, or the date of submission of the commission\'s final report. Alternatively, some commissions are given fixed calendar termination dates.\nAll five commission proposals would provide for the commission to terminate within a certain period of time following submission of its final report. H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6455 would each direct the commission to terminate 60 days after the submission; H.R. 6548 specifies a time line of 90 days after submission.\n\n Commission Powers\n\nEach of the five proposals would provide the proposed commission with certain powers to carry out its mission (see Table 1 for specifics). One general issue for commissions is who is authorized to execute such powers. In some cases, the commission itself executes its powers, with the commission deciding whether to devise rules and procedures for the general use of such power. In other cases, the legislation specifically authorizes the commission to give discretionary power to subcommittees or individual commission members. Finally, the legislation itself might grant certain powers to individual members of the commission, such as the chair.\n\n Hearings and Evidence\n\nAll five bills would provide the proposed commission with the power to hold hearings, take testimony, and receive evidence. All five commissions would also be provided the power to administer oaths to witnesses.\n\n Subpoenas\n\nFour of the bills would provide the commission with subpoena power. H.R. 6440 would not provide subpoena power to the commission. H.R. 6429 , H.R. 6431 , and H.R. 6548 would provide that subpoenas could only be issued by either (1) agreement of the chair and vice chair, or (2) the affirmative vote of 6 (of 10) commission members. H.R. 6455 would require that a subpoena could only be issued by either agreement of the chair and vice chair or an affirmative vote of 18 (of 25) commission members. 
All four bills that would provide subpoena power contain substantially similar judicial methods of subpoena enforcement.\n\n Administrative Support\n\nAll five of the bills would provide that the commission receive administrative support from the General Services Administration (GSA). The GSA provides administrative support to dozens of federal entities, including congressional advisory commissions. Each of the five bills would provide that GSA be reimbursed for its services by the commission. Each bill also provides that other departments or agencies may provide funds, facilities, staff, and other services to the commission.\n\n Other Powers\n\nWithout explicit language authorizing certain activities, commissions often cannot gather information, enter into contracts, use the U.S. mail like an executive branch entity, or accept donations or gifts. \nAll five bills direct that federal agencies provide information to the commission upon request. H.R. 6429 , H.R. 6431 , and H.R. 6548 would also provide that the commission could use the U.S. mails in the same manner as any department or agency, enter into contracts, and accept gifts or donations of services or property.\n\n Staffing\n\nThe proposed COVID-19 commissions contain staffing provisions commonly found in congressional advisory commission legislation. Congressional advisory commissions are usually authorized to hire staff. Most statutes specify that the commission may hire a lead staffer, often referred to as a \"staff director,\" \"executive director,\" or another similar title, in addition to additional staff as needed. Rather than mandate a specific staff size, many commissions are instead authorized to appoint a staff director and other personnel as necessary, subject to the limitations of available funds.\nMost congressional commissions are also authorized to hire consultants, procure intermittent services, and request that federal agencies detail personnel to aid the work of the commission.\n\n Director and Commission Staff\n\nFour of the bills provide that the commission may hire staff without regard to certain laws regarding the competitive service; H.R. 6440 does not specifically exempt the commission from such laws. Four bills ( H.R. 6429 , H.R. 6431 , H.R. 6455 , and H.R. 6548 ) would authorize, but not require, the commission to hire a staff director and additional staff, as appropriate. Four proposals would limit staff salaries to level V of the executive schedule. Three of the bills would specifically designate staff as federal employees for the purposes of certain laws, such as workman\'s compensation, retirement, and other benefits.\n\n Detailees\n\nWhen authorized, some commissions can have federal agency staff detailed to the commission. All five bills would provide that federal employees could be detailed to the commission. Four bills would provide that the detailee would be without reimbursement to his or her home agency. H.R. 6440 would allow detailees on a reimbursable basis. \n\n Experts and Consultants\n\nAll five bills would provide the commission with the authority to hire experts and consultants. Four of the bills limit the rate of pay for consultants to level IV of the Executive Schedule. H.R. 6440 does not specify a specific limit.\n\n Security Clearances\n\nFour bills would provide that federal agencies and departments shall cooperate with the commission to provide members and staff appropriate security clearances. H.R. 
6440 does not contain a security clearance provision.\n\n Funding and Costs\n\nCommissions generally require funding to help meet their statutory goals. When designing a commission, therefore, policymakers may consider both how the commission will be funded, and how much funding the commission will be authorized to receive. Four of the five proposals specify a funding mechanism for the commission.\nHow commissions are funded and the amounts that they receive vary considerably. Several factors can contribute to overall commission costs. These factors might include the cost of hiring staff, contracting with outside consultants, and engaging administrative support, among others. Additionally, most commissions reimburse the travel expenditures of commissioners and staff, and some compensate their members. The duration of a commission can also significantly affect its cost; past congressional commissions have been designed to last anywhere from several months to several years.\n\n Costs\n\nIt is difficult to estimate or predict the potential overall cost of any commission. Annual budgets for congressional advisory entities range from several hundred thousand dollars to millions of dollars annually. Overall expenses for any individual advisory entity depend on a variety of factors, the most important of which are the number of paid staff and the commission\'s duration and scope. Some commissions have few full-time staff; others employ large numbers, such as the National Commission on Terrorist Attacks Upon the United States, which had a full-time paid staff of nearly 80. Secondary factors that can affect commission costs include the number of commissioners, how often the commission meets or holds hearings, whether or not the commission travels or holds field hearings, and the publications the commission produces.\n\n Authorized Funding\n\nThree of the bills ( H.R. 6429 , H.R. 6440 , and H.R. 6548 ) would authorize the appropriation of \"such sums as may be necessary\" for the commission, to be derived in equal amounts from the contingent fund of the Senate and the applicable accounts of the House of Representatives. H.R. 6429 and H.R. 6548 would provide that funds are available until the commission terminates. H.R. 6455 would authorize the appropriation of $4 million for the commission, to remain available until the commission terminates. H.R. 6431 does not include an authorization of appropriations.\n\n Comparison of Proposals to Create a COVID-19 Commission\n\n Table 1 provides a side-by-side comparison of major provisions of the five proposals. For each bill, the membership structure, appointment structure, rules of procedure and operation, duties and reporting requirements, proposed commission powers, staffing provisions, and funding are compared.\n\nSummary:\n \ No newline at end of file diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index fadd56e8c2..c8357a16c6 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -15,6 +15,7 @@ import json import multiprocessing +import os from dataclasses import dataclass from typing import List, Union @@ -31,8 +32,14 @@ "The capital of the United Kindom is", "Today is a sunny day and I like", "AI is a field of computer science focused on", + "Apple is red. Banana is Yellow. 
" * 800 + "Apple is", ] +dirpath = os.path.dirname(__file__) +with open(os.path.join(dirpath, "long_prompt"), "r") as f: + long_prompt = f.read() +DEFAULT_PROMPTS.append(long_prompt) + NUM_TOP_LOGPROBS = 5 @@ -125,16 +132,14 @@ def start_model_process( ) logits = self.model.forward(input_ids).logits[0] - logprobs = F.log_softmax( - logits, dim=-1, dtype=torch.float32 - ).tolist() - # index_of_max = (lambda nums: nums.index(max(nums)))(logprobs[-1]) - # print("index", index_of_max) - logprobs = [ - sorted(token_logprobs, reverse=True)[:NUM_TOP_LOGPROBS] - for token_logprobs in logprobs - ] - prefill_logprobs.append(logprobs) + logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) + logprobs, top_indices = torch.topk( + logprobs, k=NUM_TOP_LOGPROBS, dim=-1 + ) + # print("index", top_indices) + prefill_logprobs.append(logprobs.tolist()) + del logits + del logprobs out_queue.put( ModelOutput( @@ -186,6 +191,7 @@ def __init__( tp_size=tp_size, dtype=get_dtype_str(torch_dtype), port=port, + mem_fraction_static=0.7, ) def forward( diff --git a/scripts/playground/reference_hf.py b/scripts/playground/reference_hf.py index ac91b3bed4..d2d3116101 100644 --- a/scripts/playground/reference_hf.py +++ b/scripts/playground/reference_hf.py @@ -35,18 +35,17 @@ def normal_text(args): args.model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, + device_map="auto", trust_remote_code=True, ) m.cuda() - print(m) - prompts = [ "The capital of France is", "The capital of the United Kindom is", "Today is a sunny day and I like", ] - max_new_tokens = 32 + max_new_tokens = 16 for p in prompts: if isinstance(p, str): @@ -58,10 +57,11 @@ def normal_text(args): input_ids, do_sample=False, max_new_tokens=max_new_tokens ) output_str = t.decode(output_ids[0]) - print(output_str) prefill_logits = m.forward(input_ids).logits[0][-1] + print("prefill logits", prefill_logits) + print(output_str) @torch.inference_mode() diff --git a/test/srt/models/test_embedding_models.py b/test/srt/models/test_embedding_models.py index 520e811a80..67e47d90d3 100644 --- a/test/srt/models/test_embedding_models.py +++ b/test/srt/models/test_embedding_models.py @@ -53,11 +53,13 @@ def assert_close_prefill_logits( srt_logits = torch.Tensor(srt_outputs.embed_logits[i]) similarities = torch.tensor(get_similarities(hf_logits, srt_logits)) + print("max similarity diff", torch.max(abs(similarities - 1))) - tolerance = 1e-2 - assert torch.all( - abs(similarities - 1) < tolerance - ), f"embeddings not all close" + if hf_logits.shape[0] <= 100: + tolerance = 1e-2 + assert torch.all( + abs(similarities - 1) < tolerance + ), f"embeddings not all close" def test_prefill_logits(self): for model, tp_size in MODELS: diff --git a/test/srt/models/test_generation_models.py b/test/srt/models/test_generation_models.py index ca4f096e30..bb56ebdad7 100644 --- a/test/srt/models/test_generation_models.py +++ b/test/srt/models/test_generation_models.py @@ -20,8 +20,8 @@ from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner MODELS = [ - ("meta-llama/Meta-Llama-3.1-8B-Instruct", 1), - ("google/gemma-2-2b", 1), + ("meta-llama/Meta-Llama-3.1-8B-Instruct", 1, 1.1), + ("google/gemma-2-2b", 1, 3), ] TORCH_DTYPES = [torch.float16] @@ -35,6 +35,7 @@ def assert_close_prefill_logits_and_output_strs( tp_size, torch_dtype, max_new_tokens, + long_context_tolerance, ) -> None: with HFRunner( model_path, torch_dtype=torch_dtype, is_generation_model=True @@ -53,15 +54,19 @@ def assert_close_prefill_logits_and_output_strs( hf_logprobs = 
torch.Tensor(hf_outputs.top_input_logprobs[i]) srt_logprobs = torch.Tensor(srt_outputs.top_input_logprobs[i]) - tolerance = 3e-2 - assert torch.all( - abs(hf_logprobs - srt_logprobs) < tolerance - ), f"prefill logprobs not all close" + print("max_diff", torch.max(abs(hf_logprobs - srt_logprobs))) + if hf_logprobs.shape[0] <= 100: + tolerance = 3e-2 + assert torch.all( + abs(hf_logprobs - srt_logprobs) < tolerance + ), f"prefill logprobs not all close" + print(hf_outputs.output_strs) + print(srt_outputs.output_strs) assert hf_outputs.output_strs == srt_outputs.output_strs - def test_prefill_logits(self): - for model, tp_size in MODELS: + def test_prefill_logits_and_output_strs(self): + for model, tp_size, long_context_tolerance in MODELS: for torch_dtype in TORCH_DTYPES: max_new_tokens = 8 self.assert_close_prefill_logits_and_output_strs( @@ -70,6 +75,7 @@ def test_prefill_logits(self): tp_size, torch_dtype, max_new_tokens, + long_context_tolerance=long_context_tolerance, ) From e205527cb11148b19ba4061d8503e7866c3f25dd Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Tue, 13 Aug 2024 21:14:05 -0700 Subject: [PATCH 018/118] Fix jump forward final state circular path bug. (#1084) --- python/sglang/srt/constrained/jump_forward.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/constrained/jump_forward.py b/python/sglang/srt/constrained/jump_forward.py index 7b694318e4..b00c48d478 100644 --- a/python/sglang/srt/constrained/jump_forward.py +++ b/python/sglang/srt/constrained/jump_forward.py @@ -62,16 +62,22 @@ def _init_state_to_jump_forward(regex_string): id_to_symbol.setdefault(id_, []).append(symbol) transitions = fsm_info.transitions + outgoings_ct = defaultdict(int) - state_to_jump_forward = {} + # NOTE(lsyin): Final states can lead to terminate, so they have one outgoing edge naturally + for s in fsm_info.finals: + outgoings_ct[s] = 1 + state_to_jump_forward = {} for (state, id_), next_state in transitions.items(): if id_ == fsm_info.alphabet_anything_value: + # Arbitrarily symbol cannot be recognized as jump forward continue + symbols = id_to_symbol[id_] for c in symbols: if len(c) > 1: - # Skip byte level transitions + # Skip byte level transitions like c = "5E" continue outgoings_ct[state] += 1 @@ -87,6 +93,9 @@ def _init_state_to_jump_forward(regex_string): # Process the byte level jump forward outgoings_ct = defaultdict(int) + for s in fsm_info.finals: + outgoings_ct[s] = 1 + for (state, id_), next_state in transitions.items(): if id_ == fsm_info.alphabet_anything_value: continue @@ -177,3 +186,5 @@ def test_main(regex_string): test_main(r"霍格沃茨特快列车|霍比特人比尔博") # 霍格: \xe9\x9c\x8d \xe6\xa0\xbc ... # 霍比: \xe9\x9c\x8d \xe6\xaf\x94 ... 
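To see why final states need that implicit outgoing edge, the sketch below repeats the edge-counting idea on a hand-built toy automaton; the regex, states, and transition table are invented for illustration and are not produced by outlines:

```python
from collections import defaultdict

# Toy DFA for the pattern "ab+" (hand-written): 0 --a--> 1, 1 --b--> 2, 2 --b--> 2; state 2 is final.
transitions = {(0, "a"): 1, (1, "b"): 2, (2, "b"): 2}
finals = {2}

outgoings_ct = defaultdict(int)
for s in finals:
    outgoings_ct[s] = 1          # a final state can also terminate, as in the fix above
for (state, _symbol), _next_state in transitions.items():
    outgoings_ct[state] += 1

print(dict(outgoings_ct))        # {2: 2, 0: 1, 1: 1}
```

Because state 2 now counts two ways out (emit another "b" or stop), the jump-forward walk no longer treats it as a forced single path and cannot loop on it; the `test_main(r"[-+]?[0-9]+[ ]*")` case added just below exercises the same situation with a trailing repeatable group.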
+ + test_main(r"[-+]?[0-9]+[ ]*") From c8423ca3112f6bf638f294a548e16ab4a3e79f1f Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Wed, 14 Aug 2024 15:27:35 +0800 Subject: [PATCH 019/118] ci: update timeout and retry (#1086) Co-authored-by: Liangsheng Yin --- .github/workflows/accuracy-test.yml | 4 +++- .github/workflows/e2e-test.yml | 5 +++++ .github/workflows/moe-test.yml | 15 +++++++++++---- .github/workflows/unit-test.yml | 4 ++++ test/srt/test_moe_serving_throughput.py | 2 +- 5 files changed, 24 insertions(+), 6 deletions(-) diff --git a/.github/workflows/accuracy-test.yml b/.github/workflows/accuracy-test.yml index 16bb584f4a..da2d98e861 100644 --- a/.github/workflows/accuracy-test.yml +++ b/.github/workflows/accuracy-test.yml @@ -6,11 +6,13 @@ on: paths: - "python/sglang/**" - "test/**" + - ".github/workflows/accuracy-test.yml" pull_request: branches: [ main ] paths: - "python/sglang/**" - "test/**" + - ".github/workflows/accuracy-test.yml" workflow_dispatch: concurrency: @@ -43,4 +45,4 @@ jobs: run: | cd test/srt python3 test_eval_accuracy_large.py - timeout-minutes: 20 + timeout-minutes: 10 diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 455594bd72..3a338a6577 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -6,11 +6,13 @@ on: paths: - "python/sglang/**" - "test/**" + - ".github/workflows/e2e-test.yml" pull_request: branches: [ main ] paths: - "python/sglang/**" - "test/**" + - ".github/workflows/e2e-test.yml" workflow_dispatch: concurrency: @@ -39,13 +41,16 @@ jobs: run: | cd test/srt python3 -m unittest test_serving_throughput.TestServingThroughput.test_default + timeout-minutes: 10 - name: Benchmark Serving Throughput (w/o RadixAttention) run: | cd test/srt python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache + timeout-minutes: 10 - name: Benchmark Serving Throughput (w/ ChunkedPrefill) run: | cd test/srt python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_with_chunked_prefill + timeout-minutes: 10 diff --git a/.github/workflows/moe-test.yml b/.github/workflows/moe-test.yml index a781f2eff8..39eb2a71dd 100644 --- a/.github/workflows/moe-test.yml +++ b/.github/workflows/moe-test.yml @@ -6,11 +6,13 @@ on: paths: - "python/sglang/**" - "test/**" + - ".github/workflows/moe-test.yml" pull_request: branches: [ main ] paths: - "python/sglang/**" - "test/**" + - ".github/workflows/moe-test.yml" workflow_dispatch: concurrency: @@ -36,7 +38,12 @@ jobs: pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall - name: Benchmark MOE Serving Throughput - run: | - cd test/srt - python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default - python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache + uses: nick-fields/retry@v3 + with: + timeout_minutes: 15 + max_attempts: 2 + retry_on: error + command: | + cd test/srt + python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default + python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index f9b79dc674..59228585fe 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -6,11 +6,13 @@ on: paths: - "python/sglang/**" - "test/**" + - ".github/workflows/unit-test.yml" pull_request: branches: [ main ] paths: - "python/sglang/**" - "test/**" 
+ - ".github/workflows/unit-test.yml" workflow_dispatch: concurrency: @@ -41,8 +43,10 @@ jobs: run: | cd test/srt python3 run_suite.py --suite minimal + timeout-minutes: 15 - name: Test Frontend Language run: | cd test/lang python3 run_suite.py --suite minimal + timeout-minutes: 10 diff --git a/test/srt/test_moe_serving_throughput.py b/test/srt/test_moe_serving_throughput.py index 48798c5d5f..713eba7abb 100644 --- a/test/srt/test_moe_serving_throughput.py +++ b/test/srt/test_moe_serving_throughput.py @@ -73,7 +73,7 @@ def test_default(self): if os.getenv("SGLANG_IS_IN_CI", "false") == "true": # A100 (PCIE) performance - assert res["output_throughput"] > 950 + assert res["output_throughput"] > 930 def test_default_without_radix_cache(self): res = self.run_test( From 616b59f384ad13b824fa8bb634444b43967f8c8a Mon Sep 17 00:00:00 2001 From: rainred <107027757+gryffindor-rr@users.noreply.github.com> Date: Wed, 14 Aug 2024 15:28:04 +0800 Subject: [PATCH 020/118] [Feature] modify Runtime to support skip_tokenizer_init (#1088) Co-authored-by: lzhang --- python/sglang/srt/server.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 7331425fae..8f735ac0c7 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -533,11 +533,18 @@ async def async_generate( prompt: str, sampling_params: Optional[Dict] = None, ): - json_data = { - "text": prompt, - "sampling_params": sampling_params, - "stream": True, - } + if self.server_args.skip_tokenizer_init: + json_data = { + "input_ids": prompt, + "sampling_params": sampling_params, + "stream": True, + } + else: + json_data = { + "text": prompt, + "sampling_params": sampling_params, + "stream": True, + } pos = 0 timeout = aiohttp.ClientTimeout(total=3 * 3600) @@ -549,10 +556,13 @@ async def async_generate( if chunk == "data: [DONE]\n\n": break data = json.loads(chunk[5:].strip("\n")) - cur = data["text"][pos:] - if cur: - yield cur - pos += len(cur) + if hasattr(data, "text"): + cur = data["text"][pos:] + if cur: + yield cur + pos += len(cur) + else: + yield data add_request = async_generate From 8f790ac1005cfb5403a0a1e847bb0e050a4282da Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 14 Aug 2024 03:25:38 -0700 Subject: [PATCH 021/118] Fix a bug in cuda graph runner (#1094) --- python/sglang/srt/model_executor/cuda_graph_runner.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index 9bfd4a646c..a74e8eef78 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -98,8 +98,8 @@ def __init__(self, model_runner, max_batch_size_to_capture, use_torch_compile): self.req_pool_indices = torch.zeros( (self.max_bs,), dtype=torch.int32, device="cuda" ) - self.seq_lens = torch.ones((self.max_bs,), dtype=torch.int32, device="cuda") - self.position_ids_offsets = torch.zeros( + self.seq_lens = torch.zeros((self.max_bs,), dtype=torch.int32, device="cuda") + self.position_ids_offsets = torch.ones( (self.max_bs,), dtype=torch.int32, device="cuda" ) self.out_cache_loc = torch.zeros( @@ -201,7 +201,7 @@ def run_once(): out_cache_loc=out_cache_loc, return_logprob=False, top_logprobs_nums=0, - positions=(seq_lens - 1).to(torch.int64), + positions=(seq_lens - 1 + position_ids_offsets).to(torch.int64), 
flashinfer_decode_wrapper=flashinfer_decode_wrapper, ) @@ -225,8 +225,8 @@ def replay(self, batch: ScheduleBatch): index = bisect.bisect_left(self.batch_size_list, raw_bs) bs = self.batch_size_list[index] if bs != raw_bs: - self.seq_lens.fill_(1) - self.position_ids_offsets.zero_() + self.seq_lens.zero_() + self.position_ids_offsets.fill_(1) self.out_cache_loc.zero_() # Common inputs From f14569f64aa19bcdbf51e08d0aba7e19ccfb5b88 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Wed, 14 Aug 2024 18:36:24 +0800 Subject: [PATCH 022/118] ci: remove workflow path trigger (#1096) --- .github/workflows/accuracy-test.yml | 2 -- .github/workflows/e2e-test.yml | 2 -- .github/workflows/moe-test.yml | 2 -- .github/workflows/unit-test.yml | 2 -- 4 files changed, 8 deletions(-) diff --git a/.github/workflows/accuracy-test.yml b/.github/workflows/accuracy-test.yml index da2d98e861..374f0d2856 100644 --- a/.github/workflows/accuracy-test.yml +++ b/.github/workflows/accuracy-test.yml @@ -6,13 +6,11 @@ on: paths: - "python/sglang/**" - "test/**" - - ".github/workflows/accuracy-test.yml" pull_request: branches: [ main ] paths: - "python/sglang/**" - "test/**" - - ".github/workflows/accuracy-test.yml" workflow_dispatch: concurrency: diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 3a338a6577..cb11e0db53 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -6,13 +6,11 @@ on: paths: - "python/sglang/**" - "test/**" - - ".github/workflows/e2e-test.yml" pull_request: branches: [ main ] paths: - "python/sglang/**" - "test/**" - - ".github/workflows/e2e-test.yml" workflow_dispatch: concurrency: diff --git a/.github/workflows/moe-test.yml b/.github/workflows/moe-test.yml index 39eb2a71dd..51f7d02261 100644 --- a/.github/workflows/moe-test.yml +++ b/.github/workflows/moe-test.yml @@ -6,13 +6,11 @@ on: paths: - "python/sglang/**" - "test/**" - - ".github/workflows/moe-test.yml" pull_request: branches: [ main ] paths: - "python/sglang/**" - "test/**" - - ".github/workflows/moe-test.yml" workflow_dispatch: concurrency: diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 59228585fe..4b61c4c4ed 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -6,13 +6,11 @@ on: paths: - "python/sglang/**" - "test/**" - - ".github/workflows/unit-test.yml" pull_request: branches: [ main ] paths: - "python/sglang/**" - "test/**" - - ".github/workflows/unit-test.yml" workflow_dispatch: concurrency: From fe5024325b8bf952714a49575c86e9b608d01f58 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Wed, 14 Aug 2024 19:40:05 +0800 Subject: [PATCH 023/118] docs: update README (#1098) --- .github/ISSUE_TEMPLATE/1-bug-report.yml | 3 ++- .github/ISSUE_TEMPLATE/2-feature-request.yml | 6 ++++++ .github/pull_request_template.md | 7 ++++--- README.md | 2 +- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1-bug-report.yml b/.github/ISSUE_TEMPLATE/1-bug-report.yml index c1684c14bb..5f6734867c 100644 --- a/.github/ISSUE_TEMPLATE/1-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/1-bug-report.yml @@ -12,6 +12,7 @@ body: - label: 2. The bug has not been fixed in the latest version. - label: 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback. - label: 4. 
If the issue you raised is not a bug but a question, please raise a discussion at https://github.com/sgl-project/sglang/discussions/new/choose Otherwise, it will be closed. + - label: 5. Please use English, otherwise it will be closed. - type: textarea attributes: label: Describe the bug @@ -31,7 +32,7 @@ body: attributes: label: Environment description: | - Please provide necessary environment information here with `python3 -m sglang.check_env`. + Please provide necessary environment information here with `python3 -m sglang.check_env`. Otherwise the issue will be closed. placeholder: Environment here. validations: required: true diff --git a/.github/ISSUE_TEMPLATE/2-feature-request.yml b/.github/ISSUE_TEMPLATE/2-feature-request.yml index 5ab369f8b0..31bc4a127e 100644 --- a/.github/ISSUE_TEMPLATE/2-feature-request.yml +++ b/.github/ISSUE_TEMPLATE/2-feature-request.yml @@ -3,6 +3,12 @@ description: Suggest an idea for this project title: "[Feature] " body: +- type: checkboxes + attributes: + label: Checklist + options: + - label: 1. If the issue you raised is not a feature but a question, please raise a discussion at https://github.com/sgl-project/sglang/discussions/new/choose Otherwise, it will be closed. + - label: 2. Please use English, otherwise it will be closed. - type: textarea attributes: label: Motivation diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 20f4a10bc5..acc9682d64 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -10,6 +10,7 @@ Briefly describe the changes made in this PR. ## Checklist -1. Ensure pre-commit `pre-commit run --all-files` or other linting tools are used to fix potential lint issues. -2. Confirm that modifications are covered by complete unit tests. If not, please add more unit tests for correctness. -3. Modify documentation as needed, such as docstrings or example tutorials. +-[] Before submitting a PR for review, make sure it has passed verification in your local development environment **at least**. +-[] Ensure pre-commit `pre-commit run --all-files` or other linting tools are used to fix potential lint issues. +-[] Confirm that modifications are covered by complete unit tests. If not, please add more unit tests for correctness. +-[] Modify documentation as needed, such as docstrings or example tutorials. diff --git a/README.md b/README.md index 117c329bb0..451e0a6934 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ docker run --gpus all \ 2. Execute the command `docker compose up -d` in your terminal. ### Common Notes -- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server. +- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue. - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`. 
## Backend: SGLang Runtime (SRT) From a59636bb5e68f36308bb092674429d27c05cf125 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 14 Aug 2024 04:40:44 -0700 Subject: [PATCH 024/118] Update grok 1 model (#1095) --- benchmark/gsm8k/bench_sglang.py | 3 + python/sglang/bench_latency.py | 1 + python/sglang/srt/layers/activation.py | 1 - .../sglang/srt/layers/fused_moe/__init__.py | 1 + .../srt/layers/{ => fused_moe}/fused_moe.py | 273 ++++---- python/sglang/srt/layers/fused_moe/layer.py | 587 ++++++++++++++++++ python/sglang/srt/layers/logits_processor.py | 8 +- .../sglang/srt/model_executor/model_runner.py | 4 +- python/sglang/srt/models/grok.py | 444 ++----------- python/sglang/srt/models/mixtral.py | 1 - python/sglang/srt/utils.py | 3 +- 11 files changed, 813 insertions(+), 513 deletions(-) create mode 100644 python/sglang/srt/layers/fused_moe/__init__.py rename python/sglang/srt/layers/{ => fused_moe}/fused_moe.py (78%) create mode 100644 python/sglang/srt/layers/fused_moe/layer.py diff --git a/benchmark/gsm8k/bench_sglang.py b/benchmark/gsm8k/bench_sglang.py index 298ec11d73..652086f913 100644 --- a/benchmark/gsm8k/bench_sglang.py +++ b/benchmark/gsm8k/bench_sglang.py @@ -88,6 +88,9 @@ def few_shot_gsm8k(s, question): for i in range(len(states)): preds.append(get_answer_value(states[i]["answer"])) + # print(f"{preds=}") + # print(f"{labels=}") + # Compute accuracy acc = np.mean(np.array(preds) == np.array(labels)) invalid = np.mean(np.array(preds) == INVALID) diff --git a/python/sglang/bench_latency.py b/python/sglang/bench_latency.py index ee227849cf..e500d30d1c 100644 --- a/python/sglang/bench_latency.py +++ b/python/sglang/bench_latency.py @@ -221,6 +221,7 @@ def correctness_test( # Prepare inputs input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer) + rank_print(f"{input_ids=}") if bench_args.cut_len > 0: # Prefill diff --git a/python/sglang/srt/layers/activation.py b/python/sglang/srt/layers/activation.py index 64d3915946..7cd8abb6f9 100644 --- a/python/sglang/srt/layers/activation.py +++ b/python/sglang/srt/layers/activation.py @@ -14,7 +14,6 @@ """Fused operators for activation layers.""" import torch -import torch.nn as nn import torch.nn.functional as F from flashinfer.activation import silu_and_mul from vllm.model_executor.custom_op import CustomOp diff --git a/python/sglang/srt/layers/fused_moe/__init__.py b/python/sglang/srt/layers/fused_moe/__init__.py new file mode 100644 index 0000000000..5f7691c09f --- /dev/null +++ b/python/sglang/srt/layers/fused_moe/__init__.py @@ -0,0 +1 @@ +from sglang.srt.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase diff --git a/python/sglang/srt/layers/fused_moe.py b/python/sglang/srt/layers/fused_moe/fused_moe.py similarity index 78% rename from python/sglang/srt/layers/fused_moe.py rename to python/sglang/srt/layers/fused_moe/fused_moe.py index c5630fa5db..717be5ce96 100644 --- a/python/sglang/srt/layers/fused_moe.py +++ b/python/sglang/srt/layers/fused_moe/fused_moe.py @@ -1,20 +1,5 @@ -""" -Copyright 2023-2024 SGLang Team -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -""" - # Adapted from -# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/layers/fused_moe/fused_moe.py#L1 +# https://github.com/vllm-project/vllm/tree/v0.5.4/vllm/model_executor/layers/fused_moe """Fused MoE kernel.""" import functools import json @@ -24,6 +9,7 @@ import torch import triton import triton.language as tl +import vllm.envs as envs from vllm import _custom_ops as ops from vllm.logger import init_logger @@ -373,6 +359,31 @@ def get_default_config( return config +def try_get_optimal_moe_config( + w1_shape: Tuple[int, ...], + w2_shape: Tuple[int, ...], + top_k: int, + dtype: Optional[str], + M: int, + override_config: Optional[Dict[str, Any]] = None, +): + if override_config: + config = override_config + else: + # First try to load optimal config from the file + E, _, N = w2_shape + configs = get_moe_configs(E, N, dtype) + + if configs: + # If an optimal configuration map has been found, look up the + # optimal config + config = configs[min(configs.keys(), key=lambda x: abs(x - M))] + else: + # Else use the default config + config = get_default_config(M, E, N, w1_shape[2], top_k, dtype) + return config + + def fused_topk( hidden_states: torch.Tensor, gating_output: torch.Tensor, @@ -403,6 +414,41 @@ def fused_topk( return topk_weights, topk_ids +# This is used by the Deepseek-V2 model +def grouped_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + num_expert_group: int = 0, + topk_group: int = 0, +): + + assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch" + + scores = torch.softmax(gating_output, dim=-1) + num_token = scores.shape[0] + group_scores = ( + scores.view(num_token, num_expert_group, -1).max(dim=-1).values + ) # [n, n_group] + group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[ + 1 + ] # [n, top_k_group] + group_mask = torch.zeros_like(group_scores) # [n, n_group] + group_mask.scatter_(1, group_idx, 1) # [n, n_group] + score_mask = ( + group_mask.unsqueeze(-1) + .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group) + .reshape(num_token, -1) + ) # [n, e] + tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e] + topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False) + + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + return topk_weights, topk_ids + + def fused_experts( hidden_states: torch.Tensor, w1: torch.Tensor, @@ -425,24 +471,23 @@ def fused_experts( assert w2.is_contiguous(), "Expert weights2 must be contiguous" assert hidden_states.dtype in [torch.float32, torch.float16, torch.bfloat16] - M, _ = hidden_states.shape + num_tokens, _ = hidden_states.shape E, N, _ = w1.shape + # We execute the fused_moe kernel in chunks to circumvent this issue: + # https://github.com/vllm-project/vllm/issues/5938 + CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE + M = min(num_tokens, CHUNK_SIZE) + + get_config_func = functools.partial( + try_get_optimal_moe_config, + w1.shape, + w2.shape, + topk_ids.shape[1], + "float8" if use_fp8 else None, + override_config=override_config, + ) - if override_config: - config = override_config - else: - # First try to load optimal config from the file - configs = get_moe_configs(E, w2.shape[2], "float8" if use_fp8 else None) - - if configs: - # If an optimal configuration map has been found, look up 
the - # optimal config - config = configs[min(configs.keys(), key=lambda x: abs(x - M))] - else: - # Else use the default config - config = get_default_config( - M, E, N, w1.shape[2], topk_ids.shape[1], "float8" if use_fp8 else None - ) + config = get_config_func(M) intermediate_cache1 = torch.empty( (M, topk_ids.shape[1], N), @@ -460,56 +505,85 @@ def fused_experts( dtype=hidden_states.dtype, ) - sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size( - topk_ids, config["BLOCK_SIZE_M"], E - ) compute_type = tl.bfloat16 if hidden_states.dtype == torch.bfloat16 else tl.float16 - invoke_fused_moe_kernel( - hidden_states, - w1, - intermediate_cache1, - a1_scale, - w1_scale, - topk_weights, - topk_ids, - sorted_token_ids, - expert_ids, - num_tokens_post_padded, - False, - topk_ids.shape[1], - config, - compute_type=compute_type, - use_fp8=use_fp8, - ) + if inplace: + out_hidden_states = hidden_states + else: + out_hidden_states = torch.empty_like(hidden_states) - ops.gelu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) + for chunk in range((num_tokens // CHUNK_SIZE) + 1): + begin_chunk_idx, end_chunk_idx = ( + chunk * CHUNK_SIZE, + min((chunk + 1) * CHUNK_SIZE, num_tokens), + ) + curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx] + tokens_in_chunk, _ = curr_hidden_states.shape + + if tokens_in_chunk == 0: + break + + if tokens_in_chunk < CHUNK_SIZE and chunk > 0: + # Adjust the intermediate cache size and config for the last + # chunk. Note that in most cases we only have one chunk + # so the cache size and config are already set correctly and + # do not need to be adjusted. + intermediate_cache1 = intermediate_cache1[:tokens_in_chunk] + intermediate_cache2 = intermediate_cache2[:tokens_in_chunk] + intermediate_cache3 = intermediate_cache3[:tokens_in_chunk] + config = get_config_func(tokens_in_chunk) + + curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx] + curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx] + + sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size( + curr_topk_ids, config["BLOCK_SIZE_M"], E + ) - invoke_fused_moe_kernel( - intermediate_cache2, - w2, - intermediate_cache3, - a2_scale, - w2_scale, - topk_weights, - topk_ids, - sorted_token_ids, - expert_ids, - num_tokens_post_padded, - True, - 1, - config, - compute_type=compute_type, - use_fp8=use_fp8, - ) + invoke_fused_moe_kernel( + curr_hidden_states, + w1, + intermediate_cache1, + a1_scale, + w1_scale, + curr_topk_weights, + curr_topk_ids, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + False, + topk_ids.shape[1], + config, + compute_type=compute_type, + use_fp8=use_fp8, + ) - if inplace: - return torch.sum( + ops.gelu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) + + invoke_fused_moe_kernel( + intermediate_cache2, + w2, + intermediate_cache3, + a2_scale, + w2_scale, + curr_topk_weights, + curr_topk_ids, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + True, + 1, + config, + compute_type=compute_type, + use_fp8=use_fp8, + ) + + torch.sum( intermediate_cache3.view(*intermediate_cache3.shape), dim=1, - out=hidden_states, + out=out_hidden_states[begin_chunk_idx:end_chunk_idx], ) - return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) + return out_hidden_states def fused_moe( @@ -521,6 +595,9 @@ def fused_moe( renormalize: bool, inplace: bool = False, override_config: Optional[Dict[str, Any]] = None, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, 
+ topk_group: Optional[int] = None, use_fp8: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, @@ -543,6 +620,10 @@ def fused_moe( Defaults to False. - override_config (Optional[Dict[str, Any]]): Optional override for the kernel configuration. + - num_expert_group: Optional[int]: additional parameter for grouped_topk + - topk_group: Optional[int]: additional parameter for grouped_topk + - use_grouped_topk: If True, use grouped_topk instead of fused_topk + note: Deepseekv2 model uses grouped_topk - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner products for w1 and w2. Defaults to False. - w1_scale (Optional[torch.Tensor]): Optional scale to be used for @@ -556,12 +637,18 @@ def fused_moe( # Check constraints. assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" - if hasattr(ops, "topk_softmax"): - topk_weights, topk_ids = fused_topk( - hidden_states, gating_output, topk, renormalize + if use_grouped_topk: + assert num_expert_group is not None and topk_group is not None + topk_weights, topk_ids = grouped_topk( + hidden_states, + gating_output, + topk, + renormalize, + num_expert_group, + topk_group, ) else: - topk_weights, topk_ids = fused_topk_v0_4_3( + topk_weights, topk_ids = fused_topk( hidden_states, gating_output, topk, renormalize ) @@ -579,33 +666,3 @@ def fused_moe( a1_scale=a1_scale, a2_scale=a2_scale, ) - - -def fused_topk_v0_4_3( - hidden_states: torch.Tensor, - gating_output: torch.Tensor, - topk: int, - renormalize: bool, -): - import vllm._moe_C as moe_kernels - - M, _ = hidden_states.shape - - topk_weights = torch.empty( - M, topk, dtype=torch.float32, device=hidden_states.device - ) - topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device) - token_expert_indicies = torch.empty( - M, topk, dtype=torch.int32, device=hidden_states.device - ) - moe_kernels.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - gating_output.float(), # TODO(woosuk): Optimize this. - ) - del token_expert_indicies # Not used. Will be used in the future. 
- if renormalize: - topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) - - return topk_weights, topk_ids diff --git a/python/sglang/srt/layers/fused_moe/layer.py b/python/sglang/srt/layers/fused_moe/layer.py new file mode 100644 index 0000000000..0b17c14ffd --- /dev/null +++ b/python/sglang/srt/layers/fused_moe/layer.py @@ -0,0 +1,587 @@ +# Adapted from +# https://github.com/vllm-project/vllm/tree/v0.5.4/vllm/model_executor/layers/fused_moe +from abc import abstractmethod +from typing import List, Optional, Tuple + +import torch +from vllm.distributed import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce, +) +from vllm.logger import init_logger +from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) +from vllm.model_executor.layers.quantization.fp8 import Fp8Config +from vllm.model_executor.utils import set_weight_attrs + +logger = init_logger(__name__) + + +class FusedMoEMethodBase(QuantizeMethodBase): + + @abstractmethod + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + raise NotImplementedError + + @abstractmethod + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None, + ) -> torch.Tensor: + raise NotImplementedError + + +class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): + """MoE method without quantization.""" + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + + # Fused gate_up_proj (column parallel) + w13_weight = torch.nn.Parameter( + torch.empty( + num_experts, 2 * intermediate_size, hidden_size, dtype=params_dtype + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + # down_proj (row parallel) + w2_weight = torch.nn.Parameter( + torch.empty( + num_experts, hidden_size, intermediate_size, dtype=params_dtype + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None, + ) -> torch.Tensor: + return self.forward( + x, + layer.w13_weight, + layer.w2_weight, + router_logits, + top_k, + renormalize, + use_grouped_topk, + num_expert_group, + topk_group, + ) + + def forward_cuda( + self, + x: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool, + num_expert_group: Optional[int], + topk_group: Optional[int], + ) -> torch.Tensor: + from sglang.srt.layers.fused_moe.fused_moe import fused_moe + + return fused_moe( + x, + w1, + w2, + router_logits, + top_k, + renormalize=renormalize, + inplace=True, + use_grouped_topk=use_grouped_topk, + num_expert_group=num_expert_group, + topk_group=topk_group, + ) + + def forward_cpu(self, *args, **kwargs): + 
raise NotImplementedError("The CPU backend currently does not support MoE.") + + def forward_tpu( + self, + x: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool, + num_expert_group: Optional[int], + topk_group: Optional[int], + ) -> torch.Tensor: + from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe + + assert not use_grouped_topk + assert num_expert_group is None + assert topk_group is None + return fused_moe(x, w1, w2, router_logits, top_k, renormalize) + + +class FusedMoE(torch.nn.Module): + """FusedMoE layer for MoE models. + + This layer contains both MergedColumnParallel weights (gate_up_proj / + w13) and RowParallelLinear weights (down_proj/ w2). + + Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We + copy that naming convention here and handle any remapping in the + load_weights function in each model implementation. + + Args: + num_experts: Number of experts in the model + top_k: Number of experts selected for each token + hidden_size: Input hidden state size of the transformer + intermediate_size: Intermediate size of the experts + params_dtype: Data type for the parameters. + reduce_results: Whether to all all_reduce on the output of the layer + renomalize: Whether to renormalize the logits in the fused_moe kernel + quant_config: Quantization configure. + """ + + def __init__( + self, + num_experts: int, + top_k: int, + hidden_size: int, + intermediate_size: int, + params_dtype: Optional[torch.dtype] = None, + reduce_results: bool = False, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None, + quant_config: Optional[QuantizationConfig] = None, + tp_size: Optional[int] = None, + prefix: str = "", + ): + super().__init__() + + if params_dtype is None: + params_dtype = torch.get_default_dtype() + + self.tp_size = ( + tp_size if tp_size is not None else get_tensor_model_parallel_world_size() + ) + self.top_k = top_k + self.num_experts = num_experts + self.intermediate_size_per_partition = intermediate_size // self.tp_size + self.reduce_results = reduce_results + self.renormalize = renormalize + self.use_grouped_topk = use_grouped_topk + if self.use_grouped_topk: + assert num_expert_group is not None and topk_group is not None + self.num_expert_group = num_expert_group + self.topk_group = topk_group + + if quant_config is None: + self.quant_method: Optional[QuantizeMethodBase] = ( + UnquantizedFusedMoEMethod() + ) + else: + if isinstance(quant_config, Fp8Config): + self.quant_method = Fp8MoEMethod(quant_config) + else: + self.quant_method = quant_config.get_quant_method(self, prefix) + assert self.quant_method is not None + + self.quant_method.create_weights( + layer=self, + num_experts=num_experts, + hidden_size=hidden_size, + intermediate_size=self.intermediate_size_per_partition, + params_dtype=params_dtype, + weight_loader=self.weight_loader, + ) + + def weight_loader( + self, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, + shard_id: int, + expert_id: int, + pre_sharded: bool, + ): + param_data = param.data + + # Input scales can be loaded directly and should be equal. + if "input_scale" in weight_name: + if ( + param_data[expert_id] != 1 + and (param_data[expert_id] - loaded_weight).abs() > 1e-5 + ): + raise ValueError( + "input_scales of w1 and w3 of a layer " + f"must be equal. But got {param_data[expert_id]} " + f"vs. 
{loaded_weight}" + ) + param_data[expert_id] = loaded_weight + # Weight scales + elif "weight_scale" in weight_name: + # If we are in merged column case (gate_up_proj) + # shard_id 0 == gate_proj / w1 + # shard_id 2 == up_proj / w3 + if shard_id == 0 or shard_id == 2: + # We have to keep the weight scales of w1 and w3 because + # we need to re-quantize w1/w3 weights after weight loading. + idx = 0 if shard_id == 0 else 1 + param_data[expert_id][idx] = loaded_weight + # If we are in the row parallel case (down_proj) + # shard_id 1 == down_proj / w2 + else: + param_data[expert_id] = loaded_weight + # Weights + else: + tp_rank = get_tensor_model_parallel_rank() + shard_size = self.intermediate_size_per_partition + if pre_sharded: + shard = slice(None) + else: + shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) + + # w1, gate_proj case: Load into first shard of w13. + if shard_id == 0: + param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :] + # w3, up_proj case: Load into second shard of w13. + elif shard_id == 2: + param_data[expert_id, shard_size : 2 * shard_size, :] = loaded_weight[ + shard, : + ] + # w2, down_proj case: Load into only shard of w2. + elif shard_id == 1: + param_data[expert_id, :, :] = loaded_weight[:, shard] + else: + raise ValueError(f"Shard id must be in [0,1,2] but got {shard_id}") + + def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor): + assert self.quant_method is not None + + # Matrix multiply. + final_hidden_states = self.quant_method.apply( + self, + x=hidden_states, + router_logits=router_logits, + top_k=self.top_k, + renormalize=self.renormalize, + use_grouped_topk=self.use_grouped_topk, + num_expert_group=self.num_expert_group, + topk_group=self.topk_group, + ) + + if self.reduce_results and self.tp_size > 1: + final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states) + + return final_hidden_states + + @classmethod + def make_expert_params_mapping( + cls, + ckpt_gate_proj_name: str, + ckpt_down_proj_name: str, + ckpt_up_proj_name: str, + num_experts: int, + ) -> List[Tuple[str, str, int, int]]: + + gate_up = [ckpt_gate_proj_name, ckpt_up_proj_name] + gate_down_up = [ckpt_gate_proj_name, ckpt_down_proj_name, ckpt_up_proj_name] + + return ( + [ + # These are the weight scales for the experts + # (param_name, weight_name, expert_id, shard_id) + ( + ( + "experts.w13_scale" + if weight_name in gate_up + else "experts.w2_scale" + ), + f"experts.{expert_id}.{weight_name}.weight_scale", + expert_id, + shard_id, + ) + for expert_id in range(num_experts) + for shard_id, weight_name in enumerate(gate_down_up) + ] + + [ + # These are the weights for the experts + # (param_name, weight_name, expert_id, shard_id) + ( + ( + "experts.w13_weight" + if weight_name in gate_up + else "experts.w2_weight" + ), + f"experts.{expert_id}.{weight_name}.weight", + expert_id, + shard_id, + ) + for expert_id in range(num_experts) + for shard_id, weight_name in enumerate(gate_down_up) + ] + + [ + # These are the weight scales for the experts + # (param_name, weight_name, expert_id, shard_id) + ( + ( + "experts.a13_scale" + if weight_name in gate_up + else "experts.a2_scale" + ), + f"experts.{expert_id}.{weight_name}.input_scale", + expert_id, + shard_id, + ) + for expert_id in range(num_experts) + for shard_id, weight_name in enumerate(gate_down_up) + ] + ) + + +import torch +from torch.nn import Module +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + all_close_1d, 
+ per_tensor_dequantize, +) +from vllm.utils import print_warning_once + + +class Fp8MoEMethod(FusedMoEMethodBase): + """MoE method for FP8. + Supports loading FP8 checkpoints with static weight scale and + dynamic/static activation scale. + + Also supports loading quantized FP16/BF16 model checkpoints with dynamic + activation scaling. The weight scaling factor will be initialized after + the model weights are loaded. + + Args: + quant_config: The quantization config. + """ + + def __init__(self, quant_config: Fp8Config): + self.quant_config = quant_config + + def create_weights( + self, + layer: Module, + num_experts: int, + hidden_size: int, + intermediate_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + + if self.quant_config.is_checkpoint_fp8_serialized: + params_dtype = torch.float8_e4m3fn + + # WEIGHTS + w13_weight = torch.nn.Parameter( + torch.empty( + num_experts, 2 * intermediate_size, hidden_size, dtype=params_dtype + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w2_weight = torch.nn.Parameter( + torch.empty( + num_experts, hidden_size, intermediate_size, dtype=params_dtype + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + # WEIGHT_SCALES + # Allocate 2 scales for w1 and w3 respectively. + # They will be combined to a single scale after weight loading. + w13_scale = torch.nn.Parameter( + torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("w13_scale", w13_scale) + + w2_scale = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("w2_scale", w2_scale) + + # If loading fp8 checkpoint, pass the weight loaders. + # If loading an fp16 checkpoint, do not (we will quantize in + # process_weights_after_loading() + if self.quant_config.is_checkpoint_fp8_serialized: + set_weight_attrs(w13_scale, extra_weight_attrs) + set_weight_attrs(w2_scale, extra_weight_attrs) + + # INPUT_SCALES + if self.quant_config.activation_scheme == "static": + if not self.quant_config.is_checkpoint_fp8_serialized: + raise ValueError( + "Found static activation scheme for checkpoint that " + "was not serialized fp8." + ) + + a13_scale = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("a13_scale", a13_scale) + set_weight_attrs(a13_scale, extra_weight_attrs) + + a2_scale = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("a2_scale", a2_scale) + set_weight_attrs(a2_scale, extra_weight_attrs) + else: + layer.a13_scale = None + layer.a2_scale = None + + def process_weights_after_loading(self, layer: Module) -> None: + + # If checkpoint is fp16, quantize in place. + if not self.quant_config.is_checkpoint_fp8_serialized: + w13_weight = torch.empty_like( + layer.w13_weight.data, dtype=torch.float8_e4m3fn + ) + w2_weight = torch.empty_like( + layer.w2_weight.data, dtype=torch.float8_e4m3fn + ) + + # Re-initialize w13_scale because we directly quantize + # merged w13 weights and generate a single scaling factor. 
+ layer.w13_scale = torch.nn.Parameter( + torch.ones( + layer.num_experts, dtype=torch.float32, device=w13_weight.device + ), + requires_grad=False, + ) + for expert in range(layer.num_experts): + w13_weight[expert, :, :], layer.w13_scale[expert] = ( + ops.scaled_fp8_quant(layer.w13_weight.data[expert, :, :]) + ) + w2_weight[expert, :, :], layer.w2_scale[expert] = ops.scaled_fp8_quant( + layer.w2_weight.data[expert, :, :] + ) + layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) + layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) + return + + # If checkpoint is fp8, we need to handle that the + # MoE kernels require single activation scale and single weight + # scale for w13 per expert. + else: + # Fp8 moe kernels require a single activation scale. + # We take the max of all the scales in case they differ. + if self.quant_config.activation_scheme == "static": + if layer.a13_scale is None or layer.a2_scale is None: + raise ValueError( + "QuantConfig has static quantization, but found " + "activation scales are None." + ) + if not all_close_1d(layer.a13_scale) or not all_close_1d( + layer.a2_scale + ): + print_warning_once( + "Found input_scales that are not equal for " + "fp8 MoE layer. Using the maximum across experts " + "for each layer. " + ) + layer.a13_scale = torch.nn.Parameter( + layer.a13_scale.max(), requires_grad=False + ) + layer.a2_scale = torch.nn.Parameter( + layer.a2_scale.max(), requires_grad=False + ) + + # Fp8 moe kernel needs single weight scale for w13 per expert. + # We take the max then dequant and requant each expert. + assert layer.w13_scale is not None + shard_size = layer.intermediate_size_per_partition + max_w13_scales = layer.w13_scale.max(dim=1).values + for expert_id in range(layer.num_experts): + start = 0 + for shard_id in range(2): + dq_weight = per_tensor_dequantize( + layer.w13_weight[expert_id][start : start + shard_size, :], + layer.w13_scale[expert_id][shard_id], + ) + layer.w13_weight[expert_id][start : start + shard_size, :], _ = ( + ops.scaled_fp8_quant(dq_weight, max_w13_scales[expert_id]) + ) + start += shard_size + + layer.w13_scale = torch.nn.Parameter(max_w13_scales, requires_grad=False) + return + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None, + ) -> torch.Tensor: + + from sglang.srt.layers.fused_moe.fused_moe import fused_moe + + return fused_moe( + x, + layer.w13_weight, + layer.w2_weight, + router_logits, + top_k, + renormalize=renormalize, + inplace=True, + use_fp8=True, + w1_scale=layer.w13_scale, + w2_scale=layer.w2_scale, + a1_scale=layer.a13_scale, + a2_scale=layer.a2_scale, + use_grouped_topk=use_grouped_topk, + num_expert_group=num_expert_group, + topk_group=topk_group, + ) diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py index cf5045fda5..541fa0f153 100644 --- a/python/sglang/srt/layers/logits_processor.py +++ b/python/sglang/srt/layers/logits_processor.py @@ -164,9 +164,9 @@ def forward( last_logits = last_logits[:, : self.config.vocab_size].float() if hasattr(self.config, "final_logit_softcapping"): - last_logits /= self.config.final_logit_softcapping + last_logits.div_(self.config.final_logit_softcapping) last_logits = torch.tanh(last_logits) - last_logits *= self.config.final_logit_softcapping + 
last_logits.mul_(self.config.final_logit_softcapping) # Return only last_logits if logprob is not requested if not logits_metadata.return_logprob: @@ -209,9 +209,9 @@ def forward( all_logits = all_logits[:, : self.config.vocab_size].float() if hasattr(self.config, "final_logit_softcapping"): - all_logits /= self.config.final_logit_softcapping + all_logits.div_(self.config.final_logit_softcapping) all_logits = torch.tanh(all_logits) - all_logits *= self.config.final_logit_softcapping + all_logits.mul_(self.config.final_logit_softcapping) all_logprobs = all_logits del all_logits, hidden_states diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 34a40c7d71..9da284da65 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -53,7 +53,7 @@ from sglang.srt.utils import ( get_available_gpu_memory, is_generation_model, - is_llama3_405b_fp8, + is_llama3_405b_fp8_head_16, is_multimodal_model, monkey_patch_vllm_dummy_weight_loader, monkey_patch_vllm_p2p_access_check, @@ -158,7 +158,7 @@ def load_model(self): skip_tokenizer_init=True, ) - if is_llama3_405b_fp8(self.model_config) and self.tp_size <= 8: + if is_llama3_405b_fp8_head_16(self.model_config) and self.tp_size <= 8: # A temporary hack to fix the num_heads for meta-llama/Meta-Llama-3.1-405B-FP8 checkpoints self.model_config.hf_config.num_key_value_heads = 8 vllm_model_config.hf_config.num_key_value_heads = 8 diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py index 13d4330d4c..eff746f1dd 100644 --- a/python/sglang/srt/models/grok.py +++ b/python/sglang/srt/models/grok.py @@ -16,20 +16,17 @@ # Adapted from # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/mixtral.py#L1 """Inference-only Grok1 model.""" +import warnings from typing import Iterable, List, Optional, Tuple -import numpy as np import torch import torch.nn.functional as F -import tqdm from torch import nn from transformers import PretrainedConfig -from vllm import _custom_ops as ops from vllm.config import CacheConfig from vllm.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce, ) from vllm.model_executor.layers.linear import ( QKVParallelLinear, @@ -37,7 +34,6 @@ RowParallelLinear, ) from vllm.model_executor.layers.quantization.base_config import QuantizationConfig -from vllm.model_executor.layers.quantization.fp8 import Fp8Config from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, @@ -45,141 +41,13 @@ ) from vllm.model_executor.model_loader.loader import DefaultModelLoader from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.utils import set_weight_attrs -from vllm.utils import print_warning_once -from sglang.srt.layers.fused_moe import fused_moe +from sglang.srt.layers.fused_moe import FusedMoE from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.forward_batch_info import InputMetadata -use_fused = True - - -class Grok1MLP(nn.Module): - def __init__( - self, - num_experts: int, - hidden_size: int, - intermediate_size: int, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: - 
super().__init__() - self.num_experts = num_experts - self.ffn_dim = intermediate_size - self.hidden_dim = hidden_size - - self.w1 = ReplicatedLinear( - self.hidden_dim, self.ffn_dim, bias=False, quant_config=quant_config - ) - self.w2 = ReplicatedLinear( - self.ffn_dim, self.hidden_dim, bias=False, quant_config=quant_config - ) - self.w3 = ReplicatedLinear( - self.hidden_dim, self.ffn_dim, bias=False, quant_config=quant_config - ) - - self.act_fn = nn.GELU() - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - w1_out, _ = self.w1(hidden_states) - w1_out = self.act_fn(w1_out) - w3_out, _ = self.w3(hidden_states) - current_hidden_states = w1_out * w3_out - current_hidden_states, _ = self.w2(current_hidden_states) - return current_hidden_states - - -class Grok1MoEUnfused(nn.Module): - def __init__( - self, - config: PretrainedConfig, - quant_config: Optional[QuantizationConfig] = None, - ): - super().__init__() - self.config = config - self.rank = get_tensor_model_parallel_rank() - self.tp_size = get_tensor_model_parallel_world_size() - self.num_total_experts = config.num_local_experts - self.top_k = config.num_experts_per_tok - if self.tp_size > self.num_total_experts: - raise ValueError( - f"Tensor parallel size {self.tp_size} is greater than " - f"the number of experts {self.num_total_experts}." - ) - # Split experts equally between ranks - self.expert_indicies = np.array_split( - range(self.num_total_experts), self.tp_size - )[self.rank].tolist() - if not self.expert_indicies: - raise ValueError(f"Rank {self.rank} has no experts assigned to it.") - - self.experts = nn.ModuleList( - [ - ( - Grok1MLP( - self.num_total_experts, - config.hidden_size, - config.intermediate_size, - quant_config=quant_config, - ) - if idx in self.expert_indicies - else None - ) - for idx in range(self.num_total_experts) - ] - ) - self.gate = ReplicatedLinear( - config.hidden_size, self.num_total_experts, bias=False, quant_config=None - ) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - router_logits, _ = self.gate(hidden_states) - router_logits = 30 * F.tanh(router_logits / 30) - - routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) - routing_weights, selected_experts = torch.topk( - routing_weights, self.top_k, dim=-1 - ) - routing_weights = routing_weights.to(hidden_states.dtype) - hidden_dim = hidden_states.shape[1] - - final_hidden_states = torch.zeros( - (hidden_states.shape[0], hidden_dim), - dtype=hidden_states.dtype, - device=hidden_states.device, - ) - expert_mask = torch.nn.functional.one_hot( - selected_experts, num_classes=self.num_total_experts - ).permute(2, 1, 0) - - for expert_idx in self.expert_indicies: - expert_layer = self.experts[expert_idx] - idx, top_x = torch.where(expert_mask[expert_idx]) - - if top_x.shape[0] == 0: - continue - - # in torch it is faster to index using lists than torch tensors - top_x_list = top_x.tolist() - idx_list = idx.tolist() - - # Index the correct hidden states and compute the expert hidden state for - # the current expert. We need to make sure to multiply the output hidden - # states by `routing_weights` on the corresponding tokens (top-1 and top-2) - current_state = hidden_states[None, top_x_list].reshape(-1, hidden_dim) - current_hidden_states = ( - expert_layer(current_state) - * routing_weights[top_x_list, idx_list, None] - ) - - # However `index_add_` only support torch tensors for indexing so we'll use - # the `top_x` tensor here. 
- final_hidden_states.index_add_(0, top_x, current_hidden_states) - - return tensor_model_parallel_all_reduce(final_hidden_states) - class Grok1MoE(nn.Module): """A tensor-parallel MoE implementation for Grok1 that shards each expert @@ -197,221 +65,42 @@ def __init__( hidden_size: int, intermediate_size: int, params_dtype: Optional[torch.dtype] = None, - tp_size: Optional[int] = None, quant_config: Optional[QuantizationConfig] = None, + tp_size: Optional[int] = None, ): super().__init__() - self.tp_size = tp_size or get_tensor_model_parallel_world_size() - self.num_total_experts = num_experts - self.top_k = top_k self.hidden_size = hidden_size - self.intermediate_size = intermediate_size // self.tp_size - self.quant_config = quant_config - - # FIXME(pcmoritz): Make this more general to support different - # quantization schemes - self.use_fp8 = isinstance(quant_config, Fp8Config) - - if params_dtype is None: - params_dtype = torch.get_default_dtype() - self.params_dtype = params_dtype # Gate always runs at half / full precision for now. self.gate = ReplicatedLinear( - self.hidden_size, - self.num_total_experts, + hidden_size, + num_experts, bias=False, - params_dtype=self.params_dtype, + params_dtype=params_dtype, quant_config=None, ) - if self.use_fp8 and self.quant_config.is_checkpoint_fp8_serialized: - params_dtype = torch.float8_e4m3fn - - self.w13_weight = nn.Parameter( - torch.empty( - self.num_total_experts, - 2 * self.intermediate_size, - self.hidden_size, - dtype=params_dtype, - ) - ) - self.w2_weight = nn.Parameter( - torch.empty( - self.num_total_experts, - self.hidden_size, - self.intermediate_size, - dtype=params_dtype, - ) - ) - - set_weight_attrs( - self.w13_weight, - { - "weight_loader": self.weight_loader, - }, - ) - set_weight_attrs( - self.w2_weight, - { - "weight_loader": self.weight_loader, - }, + self.experts = FusedMoE( + num_experts=num_experts, + top_k=top_k, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + params_dtype=params_dtype, + reduce_results=True, + renormalize=False, + quant_config=quant_config, + tp_size=tp_size, ) - # Used for fp8. - self.w13_scale = None - self.w2_scale = None - self.a13_scale = None - self.a2_scale = None - - if self.use_fp8: - # WEIGHT_SCALE (for fp8) - self.w13_scale = nn.Parameter( - torch.ones(self.num_total_experts, dtype=torch.float32), - requires_grad=False, - ) - self.w2_scale = nn.Parameter( - torch.ones(self.num_total_experts, dtype=torch.float32), - requires_grad=False, - ) - - # If loading fp8 checkpoint, pass the weight loaders. - # If loading an fp16 checkpoint, do not (we will quantize in - # process_weights_after_loading() - if quant_config.is_checkpoint_fp8_serialized: - set_weight_attrs( - self.w13_scale, - { - "weight_loader": self.weight_loader, - }, - ) - set_weight_attrs( - self.w2_scale, - { - "weight_loader": self.weight_loader, - }, - ) - - # ACT_SCALE (for fp8) - if quant_config.activation_scheme == "static": - if not quant_config.is_checkpoint_fp8_serialized: - raise ValueError( - "Found static activation scheme for checkpoint that " - "was not serialized fp8." 
- ) - self.a13_scale = nn.Parameter( - torch.zeros(self.num_total_experts, dtype=torch.float32), - requires_grad=False, - ) - self.a2_scale = nn.Parameter( - torch.zeros(self.num_total_experts, dtype=torch.float32), - requires_grad=False, - ) - - set_weight_attrs( - self.a13_scale, - { - "weight_loader": self.weight_loader, - }, - ) - set_weight_attrs( - self.a2_scale, - { - "weight_loader": self.weight_loader, - }, - ) - - def weight_loader( - self, - param: nn.Parameter, - loaded_weight: torch.Tensor, - weight_name: str, - expert_id: int, - pre_sharded: bool, - ): - param_data = param.data - shard_size = self.intermediate_size - if pre_sharded: - # The weight is already sharded. Readl the full shard - shard = slice(None) - else: - tp_rank = get_tensor_model_parallel_rank() - shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) - if weight_name.endswith("w1.weight"): - param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :] - if weight_name.endswith("w3.weight"): - param_data[expert_id, shard_size : 2 * shard_size, :] = loaded_weight[ - shard, : - ] - if weight_name.endswith("w2.weight"): - param_data[expert_id, :, :] = loaded_weight[:, shard] - if "act_scale" in weight_name or "weight_scale" in weight_name: - param_data[expert_id] = loaded_weight - - def process_weights_after_loading(self): - # Fp8 is the only case where we need to process after loading. - if not self.use_fp8: - return - - # If checkpoint is fp16, quantize here. - if not self.quant_config.is_checkpoint_fp8_serialized: - w13_weight = torch.empty_like( - self.w13_weight.data, dtype=torch.float8_e4m3fn - ) - w2_weight = torch.empty_like(self.w2_weight.data, dtype=torch.float8_e4m3fn) - for expert in range(self.num_total_experts): - w13_weight[expert, :, :], self.w13_scale[expert] = ops.scaled_fp8_quant( - self.w13_weight.data[expert, :, :] - ) - w2_weight[expert, :, :], self.w2_scale[expert] = ops.scaled_fp8_quant( - self.w2_weight.data[expert, :, :] - ) - self.w13_weight = nn.Parameter(w13_weight, requires_grad=False) - self.w2_weight = nn.Parameter(w2_weight, requires_grad=False) - - # If checkpoint is fp8 + static, cleanup act_scales. - # Since state_dict has an act_scale per expert but our kernels - # are passed one act_scale shared across all experts. - elif self.quant_config.activation_scheme == "static": - if self.a13_scale is None or self.a2_scale is None: - raise ValueError( - "QuantConfig has static quantization, but found " - "activation scales are None." - ) - - if not all_close_1d(self.a13_scale) or not all_close_1d(self.a2_scale): - print_warning_once( - "Found act_scales that are not equal for fp8 MoE layer. " - "Using the maximum across experts for each layer. " - ) - - self.a13_scale = nn.Parameter(self.a13_scale.max(), requires_grad=False) - self.a2_scale = nn.Parameter(self.a2_scale.max(), requires_grad=False) - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - num_tokens, hidden_size = hidden_states.shape + # NOTE: hidden_states can have either 1D or 2D shape. 
+ orig_shape = hidden_states.shape hidden_states = hidden_states.view(-1, self.hidden_size) # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - final_hidden_states = fused_moe( - hidden_states, - self.w13_weight, - self.w2_weight, - router_logits, - self.top_k, - renormalize=False, - inplace=True, - use_fp8=self.use_fp8, - w1_scale=self.w13_scale, - w2_scale=self.w2_scale, - a1_scale=self.a13_scale, - a2_scale=self.a2_scale, - ) - - if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states) - - return final_hidden_states.view(num_tokens, hidden_size) + router_logits = 30.0 * F.tanh(router_logits / 30.0) + final_hidden_states = self.experts(hidden_states, router_logits) + return final_hidden_states.view(orig_shape) class Grok1Attention(nn.Module): @@ -478,6 +167,7 @@ def __init__( layer_id=layer_id, logit_cap=logit_cap, ) + # TODO(lianmin): load logit cap from config def forward( self, @@ -502,7 +192,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 + rope_theta = getattr(config, "rope_theta", 10000) self.self_attn = Grok1Attention( hidden_size=self.hidden_size, @@ -513,18 +203,13 @@ def __init__( rope_theta=rope_theta, quant_config=quant_config, ) - if use_fused: - self.block_sparse_moe = Grok1MoE( - num_experts=config.num_local_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - quant_config=quant_config, - ) - else: - self.block_sparse_moe = Grok1MoEUnfused( - config=config, quant_config=quant_config - ) + self.block_sparse_moe = Grok1MoE( + num_experts=config.num_local_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + quant_config=quant_config, + ) self.pre_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.pre_moe_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -536,6 +221,7 @@ def forward( hidden_states: torch.Tensor, input_metadata: InputMetadata, ) -> torch.Tensor: + # Self Attention hidden_states = ( self.post_attn_norm( self.self_attn( @@ -547,11 +233,11 @@ def forward( + hidden_states ) + # Fully Connected hidden_states = ( self.post_moe_norm(self.block_sparse_moe(self.pre_moe_norm(hidden_states))) + hidden_states ) - return hidden_states @@ -593,7 +279,6 @@ def forward( for i in range(len(self.layers)): hidden_states = self.layers[i](positions, hidden_states, input_metadata) - hidden_states = self.norm(hidden_states) hidden_states.mul_(self.config.output_multiplier_scale) return hidden_states @@ -615,8 +300,8 @@ def __init__( # Monkey patch _prepare_weights to load pre-sharded weights setattr(DefaultModelLoader, "_prepare_weights", _prepare_presharded_weights) + warnings.filterwarnings("ignore", category=FutureWarning) - @torch.no_grad() def forward( self, input_ids: torch.Tensor, @@ -637,50 +322,17 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] - if use_fused: - expert_params_mapping = ( - [ - # These are the weight scales for the experts - # (param_name, weight_name, expert_id) - ( - "w13_scale" if weight_name in ["w1", "w3"] else "w2_scale", - f"experts.{expert_id}.{weight_name}.weight_scale", - expert_id, - ) - for expert_id in range(self.config.num_local_experts) - for weight_name in ["w1", "w2", "w3"] - ] - + [ - # 
These are the weights for the experts - # (param_name, weight_name, expert_id) - ( - "w13_weight" if weight_name in ["w1", "w3"] else "w2_weight", - f"experts.{expert_id}.{weight_name}.weight", - expert_id, - ) - for expert_id in range(self.config.num_local_experts) - for weight_name in ["w1", "w2", "w3"] - ] - + [ - # These are the activation scales for the experts - # (param_name, weight_name, expert_id) - ( - "a13_scale" if weight_name in ["w1", "w3"] else "a2_scale", - f"experts.{expert_id}.{weight_name}.act_scale", - expert_id, - ) - for expert_id in range(self.config.num_local_experts) - for weight_name in ["w1", "w2", "w3"] - ] - ) - else: - expert_params_mapping = [] + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts, + ) params_dict = dict(self.named_parameters()) - if get_tensor_model_parallel_rank() == 0: - weights = tqdm.tqdm(weights, total=int(len(params_dict) * 3.4)) for name, loaded_weight in weights: - # print(get_tensor_model_parallel_rank(), name) if "rotary_emb.inv_freq" in name: continue @@ -691,21 +343,25 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break else: - for param_name, weight_name, expert_id in expert_params_mapping: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: continue name = name.replace(weight_name, param_name) + param = params_dict[name] weight_loader = param.weight_loader weight_loader( param, loaded_weight, weight_name, + shard_id=shard_id, expert_id=expert_id, pre_sharded=get_tensor_model_parallel_world_size() > 1, ) @@ -714,6 +370,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: continue + if name is None: + continue + param = params_dict[name] weight_loader = getattr( param, "weight_loader", default_weight_loader @@ -721,11 +380,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader(param, loaded_weight) -def all_close_1d(x: torch.Tensor) -> bool: - assert len(x.shape) == 1 - return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0])) - - old_prepare_weights = getattr(DefaultModelLoader, "_prepare_weights") diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py index d11f6c9519..45de85d879 100644 --- a/python/sglang/srt/models/mixtral.py +++ b/python/sglang/srt/models/mixtral.py @@ -32,7 +32,6 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding, ) diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 2d20881c8f..9761c851a5 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -35,7 +35,6 @@ import torch.distributed as dist from fastapi.responses import JSONResponse from packaging import version as pkg_version -from starlette.middleware.base import BaseHTTPMiddleware from torch.nn.parameter import Parameter from triton.runtime.cache import ( FileCacheManager, @@ -644,7 +643,7 @@ def set_ulimit(target_soft_limit=65535): logger.warn(f"Fail to set RLIMIT_NOFILE: {e}") -def is_llama3_405b_fp8(model_config): +def is_llama3_405b_fp8_head_16(model_config): """Return whether the model is meta-llama/Meta-Llama-3.1-405B-FP8 with 16 kv heads.""" if ( model_config.hf_config.architectures[0] == "LlamaForCausalLM" From 67c0d832a644090810a479d6d4655555a07d44a7 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Wed, 14 Aug 2024 20:25:39 +0800 Subject: [PATCH 025/118] docs: update pr template (#1099) --- .github/pull_request_template.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index acc9682d64..0926cfbe9c 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,16 +1,16 @@ -Thank you for your contribution, we really appreciate it. The following instructions will help improve your pull request and make it easier to receive feedback. If there are any items you don't understand, don't worry. Just submit the pull request and ask the maintainers for help. + ## Motivation -Please explain the motivation behind this PR and the goal you aim to achieve with it. + ## Modification -Briefly describe the changes made in this PR. + ## Checklist --[] Before submitting a PR for review, make sure it has passed verification in your local development environment **at least**. --[] Ensure pre-commit `pre-commit run --all-files` or other linting tools are used to fix potential lint issues. --[] Confirm that modifications are covered by complete unit tests. If not, please add more unit tests for correctness. --[] Modify documentation as needed, such as docstrings or example tutorials. +- [ ] Before submitting a PR for review, make sure it has passed verification in your local development environment **at least**. +- [ ] Ensure pre-commit `pre-commit run --all-files` or other linting tools are used to fix potential lint issues. 
+- [ ] Confirm that modifications are covered by complete unit tests. If not, please add more unit tests for correctness. +- [ ] Modify documentation as needed, such as docstrings or example tutorials. From a34dd86a7dd734ef95ba37a86ba929479bbbac64 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Wed, 14 Aug 2024 08:58:07 -0700 Subject: [PATCH 026/118] Use `dtype` to control generate (#1082) Co-authored-by: zhyncs --- benchmark/json_decode_regex/bench_other.py | 8 +- benchmark/json_decode_regex/bench_sglang.py | 12 +- python/sglang/api.py | 2 +- python/sglang/bench_latency.py | 2 +- .../sglang/lang/backend/runtime_endpoint.py | 109 ++++++++++-------- python/sglang/lang/ir.py | 6 +- python/sglang/srt/managers/schedule_batch.py | 13 +-- python/sglang/srt/managers/tp_worker.py | 8 +- python/sglang/srt/models/mixtral.py | 1 + python/sglang/srt/sampling_params.py | 4 - python/sglang/test/test_programs.py | 28 ++++- test/lang/test_srt_backend.py | 5 +- 12 files changed, 110 insertions(+), 88 deletions(-) diff --git a/benchmark/json_decode_regex/bench_other.py b/benchmark/json_decode_regex/bench_other.py index bbe22835a3..d80ea1de7e 100644 --- a/benchmark/json_decode_regex/bench_other.py +++ b/benchmark/json_decode_regex/bench_other.py @@ -6,11 +6,11 @@ from tqdm import tqdm -from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STRING +from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STR from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate from sglang.utils import dump_state_text, read_jsonl -REGEX_LIST = r"\[(" + REGEX_STRING + ", )*" + REGEX_STRING + r"\]" +REGEX_LIST = r"\[(" + REGEX_STR + ", )*" + REGEX_STR + r"\]" # fmt: off @@ -20,9 +20,9 @@ def json_decode(document, generate): s += "Here is the name, country, and symbol of the city in JSON format.\n" s += "{\n" s += ' "name": ' - s += generate(s, max_tokens=8, regex=REGEX_STRING + ",") + "\n" + s += generate(s, max_tokens=8, regex=REGEX_STR + ",") + "\n" s += ' "country": ' - s += generate(s, max_tokens=8, regex=REGEX_STRING + ",") + "\n" + s += generate(s, max_tokens=8, regex=REGEX_STR + ",") + "\n" s += ' "latitude": ' s += generate(s, max_tokens=8, regex=REGEX_FLOAT + ",") + "\n" s += ' "population": ' diff --git a/benchmark/json_decode_regex/bench_sglang.py b/benchmark/json_decode_regex/bench_sglang.py index 1964387229..462c77750c 100644 --- a/benchmark/json_decode_regex/bench_sglang.py +++ b/benchmark/json_decode_regex/bench_sglang.py @@ -3,14 +3,14 @@ import time import sglang as sgl -from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STRING +from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STR from sglang.test.test_utils import ( add_common_sglang_args_and_parse, select_sglang_backend, ) from sglang.utils import dump_state_text, read_jsonl -REGEX_LIST = r"\[(" + REGEX_STRING + ", )*" + REGEX_STRING + r"\]" +REGEX_LIST = r"\[(" + REGEX_STR + ", )*" + REGEX_STR + r"\]" # fmt: off @sgl.function @@ -18,8 +18,8 @@ def json_warm_up(s): s += "The information about Hogwarts is in the following JSON format.\n" with s.var_scope("json_output"): s += "{\n" - s += ' "name": ' + sgl.gen("name", max_tokens=8, regex=REGEX_STRING + ",") + "\n" - s += ' "country": ' + sgl.gen("country", max_tokens=8, regex=REGEX_STRING + ",") + "\n" + s += ' "name": ' + sgl.gen("name", max_tokens=8, regex=REGEX_STR + ",") + "\n" + s += ' "country": ' + sgl.gen("country", max_tokens=8, regex=REGEX_STR + ",") + "\n" s += ' "latitude": ' + sgl.gen("latitude", max_tokens=8, regex=REGEX_FLOAT + ",") + "\n" s += 
' "population": ' + sgl.gen("population", max_tokens=8, regex=REGEX_INT + ",") + "\n" s += ' "top 3 landmarks": ' + sgl.gen( "landmarks", max_tokens=24, regex=REGEX_LIST) + "\n" @@ -35,8 +35,8 @@ def json_decode(s, document): s += "Here is the name, country, and symbol of the city in JSON format.\n" with s.var_scope("json_output"): s += "{\n" - s += ' "name": ' + sgl.gen("name", max_tokens=8, regex=REGEX_STRING + ",") + "\n" - s += ' "country": ' + sgl.gen("country", max_tokens=8, regex=REGEX_STRING + ",") + "\n" + s += ' "name": ' + sgl.gen("name", max_tokens=8, regex=REGEX_STR + ",") + "\n" + s += ' "country": ' + sgl.gen("country", max_tokens=8, regex=REGEX_STR + ",") + "\n" s += ' "latitude": ' + sgl.gen("latitude", max_tokens=8, regex=REGEX_FLOAT + ",") + "\n" s += ' "population": ' + sgl.gen("population", max_tokens=8, regex=REGEX_INT + ",") + "\n" s += ' "top 3 landmarks": ' + sgl.gen( "landmarks", max_tokens=24, regex=REGEX_LIST) + "\n" diff --git a/python/sglang/api.py b/python/sglang/api.py index 5a177c36b0..2242b4a4c6 100644 --- a/python/sglang/api.py +++ b/python/sglang/api.py @@ -72,7 +72,7 @@ def gen( logprob_start_len: Optional[int] = None, top_logprobs_num: Optional[int] = None, return_text_in_logprobs: Optional[bool] = None, - dtype: Optional[type] = None, + dtype: Optional[Union[type, str]] = None, choices: Optional[List[str]] = None, choices_method: Optional[ChoicesSamplingMethod] = None, regex: Optional[str] = None, diff --git a/python/sglang/bench_latency.py b/python/sglang/bench_latency.py index e500d30d1c..dd86747e36 100644 --- a/python/sglang/bench_latency.py +++ b/python/sglang/bench_latency.py @@ -195,7 +195,7 @@ def extend(reqs, model_runner): token_to_kv_pool=model_runner.token_to_kv_pool, tree_cache=None, ) - batch.prepare_for_extend(model_runner.model_config.vocab_size, None) + batch.prepare_for_extend(model_runner.model_config.vocab_size) output = model_runner.forward(batch, ForwardMode.EXTEND) next_token_ids = batch.sample(output.next_token_logits) return next_token_ids, output.next_token_logits, batch diff --git a/python/sglang/lang/backend/runtime_endpoint.py b/python/sglang/lang/backend/runtime_endpoint.py index 7f0db5b359..5012f646ea 100644 --- a/python/sglang/lang/backend/runtime_endpoint.py +++ b/python/sglang/lang/backend/runtime_endpoint.py @@ -1,21 +1,23 @@ import json +import warnings from typing import List, Optional from sglang.global_config import global_config from sglang.lang.backend.base_backend import BaseBackend from sglang.lang.chat_template import get_chat_template_by_model_path -from sglang.lang.choices import ( - ChoicesDecision, - ChoicesSamplingMethod, - token_length_normalized, -) +from sglang.lang.choices import ChoicesDecision, ChoicesSamplingMethod from sglang.lang.interpreter import StreamExecutor -from sglang.lang.ir import SglSamplingParams +from sglang.lang.ir import ( + REGEX_BOOL, + REGEX_FLOAT, + REGEX_INT, + REGEX_STR, + SglSamplingParams, +) from sglang.utils import http_request class RuntimeEndpoint(BaseBackend): - def __init__( self, base_url: str, @@ -95,32 +97,52 @@ def fill_image(self, s: StreamExecutor): ) self._assert_success(res) + def _handle_dtype_to_regex(self, sampling_params: SglSamplingParams): + if sampling_params.dtype is None: + return + + if sampling_params.stop == (): + sampling_params.stop = [] + + dtype_regex = None + if sampling_params.dtype in ["int", int]: + + dtype_regex = REGEX_INT + sampling_params.stop.extend([" ", "\n"]) + elif sampling_params.dtype in ["float", float]: + + dtype_regex = 
REGEX_FLOAT + sampling_params.stop.extend([" ", "\n"]) + elif sampling_params.dtype in ["str", str]: + + dtype_regex = REGEX_STR + elif sampling_params.dtype in ["bool", bool]: + + dtype_regex = REGEX_BOOL + else: + raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}") + + if dtype_regex is not None and sampling_params.regex is not None: + warnings.warn( + f"Both dtype and regex are set. Only dtype will be used. dtype: {sampling_params.dtype}, regex: {sampling_params.regex}" + ) + + sampling_params.regex = dtype_regex + def generate( self, s: StreamExecutor, sampling_params: SglSamplingParams, ): - if sampling_params.dtype is None: - data = { - "text": s.text_, - "sampling_params": { - "skip_special_tokens": global_config.skip_special_tokens_in_output, - "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out, - **sampling_params.to_srt_kwargs(), - }, - } - elif sampling_params.dtype in [int, "int"]: - data = { - "text": s.text_, - "sampling_params": { - "skip_special_tokens": global_config.skip_special_tokens_in_output, - "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out, - "dtype": "int", - **sampling_params.to_srt_kwargs(), - }, - } - else: - raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}") + self._handle_dtype_to_regex(sampling_params) + data = { + "text": s.text_, + "sampling_params": { + "skip_special_tokens": global_config.skip_special_tokens_in_output, + "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out, + **sampling_params.to_srt_kwargs(), + }, + } for item in [ "return_logprob", @@ -151,27 +173,16 @@ def generate_stream( s: StreamExecutor, sampling_params: SglSamplingParams, ): - if sampling_params.dtype is None: - data = { - "text": s.text_, - "sampling_params": { - "skip_special_tokens": global_config.skip_special_tokens_in_output, - "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out, - **sampling_params.to_srt_kwargs(), - }, - } - elif sampling_params.dtype in [int, "int"]: - data = { - "text": s.text_, - "sampling_params": { - "skip_special_tokens": global_config.skip_special_tokens_in_output, - "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out, - "dtype": "int", - **sampling_params.to_srt_kwargs(), - }, - } - else: - raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}") + self._handle_dtype_to_regex(sampling_params) + + data = { + "text": s.text_, + "sampling_params": { + "skip_special_tokens": global_config.skip_special_tokens_in_output, + "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out, + **sampling_params.to_srt_kwargs(), + }, + } for item in [ "return_logprob", diff --git a/python/sglang/lang/ir.py b/python/sglang/lang/ir.py index 135110c1e0..0166b86870 100644 --- a/python/sglang/lang/ir.py +++ b/python/sglang/lang/ir.py @@ -8,10 +8,10 @@ from sglang.global_config import global_config from sglang.lang.choices import ChoicesSamplingMethod -REGEX_INT = r"[-+]?[0-9]+" -REGEX_FLOAT = r"[-+]?[0-9]*\.?[0-9]+" +REGEX_INT = r"[-+]?[0-9]+[ \n]*" +REGEX_FLOAT = r"[-+]?[0-9]*\.?[0-9]+[ \n]*" REGEX_BOOL = r"(True|False)" -REGEX_STRING = r"\"[\w\d\s]*\"" # bugs with regex r"\".*\"" in interegular pkg +REGEX_STR = r"\"[\w\d\s]*\"" # bugs with regex r"\".*\"" in interegular pkg @dataclasses.dataclass diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index a461fa1812..9037f5a6ea 100644 --- 
a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -383,7 +383,7 @@ def alloc_token_slots(self, num_tokens: int): return out_cache_loc - def batch_sampling_params(self, vocab_size, int_token_logit_bias): + def batch_sampling_params(self, vocab_size): device = "cuda" bs, reqs = self.batch_size(), self.reqs self.temperatures = torch.tensor( @@ -419,15 +419,8 @@ def batch_sampling_params(self, vocab_size, int_token_logit_bias): # Handle logit bias but only allocate when needed self.logit_bias = None - for i in range(bs): - if reqs[i].sampling_params.dtype == "int": - if self.logit_bias is None: - self.logit_bias = torch.zeros( - (bs, vocab_size), dtype=torch.float32, device=device - ) - self.logit_bias[i][: len(int_token_logit_bias)] = int_token_logit_bias - def prepare_for_extend(self, vocab_size: int, int_token_logit_bias: torch.Tensor): + def prepare_for_extend(self, vocab_size: int): bs = self.batch_size() reqs = self.reqs input_ids = [r.fill_ids[len(r.prefix_indices) :] for r in reqs] @@ -466,7 +459,7 @@ def prepare_for_extend(self, vocab_size: int, int_token_logit_bias: torch.Tensor self.out_cache_loc = out_cache_loc self.top_logprobs_nums = [r.top_logprobs_num for r in reqs] - self.batch_sampling_params(vocab_size, int_token_logit_bias) + self.batch_sampling_params(vocab_size) def check_decode_mem(self): bs = self.batch_size() diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index a8b952361d..4d869c5919 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -54,7 +54,6 @@ from sglang.srt.model_executor.model_runner import ModelRunner from sglang.srt.server_args import ServerArgs from sglang.srt.utils import ( - get_int_token_logit_bias, is_multimodal_model, set_random_seed, suppress_other_loggers, @@ -132,9 +131,6 @@ def __init__( ), self.model_runner.req_to_token_pool.size - 1, ) - self.int_token_logit_bias = torch.tensor( - get_int_token_logit_bias(self.tokenizer, self.model_config.vocab_size) - ) self.max_req_input_len = min( self.model_config.context_len - 1, self.max_total_num_tokens - 1, @@ -442,9 +438,7 @@ def get_new_prefill_batch(self) -> Optional[ScheduleBatch]: def forward_prefill_batch(self, batch: ScheduleBatch): # Build batch tensors - batch.prepare_for_extend( - self.model_config.vocab_size, self.int_token_logit_bias - ) + batch.prepare_for_extend(self.model_config.vocab_size) if self.model_runner.is_generation: # Forward and sample the next tokens diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py index 45de85d879..d11f6c9519 100644 --- a/python/sglang/srt/models/mixtral.py +++ b/python/sglang/srt/models/mixtral.py @@ -32,6 +32,7 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding, ) diff --git a/python/sglang/srt/sampling_params.py b/python/sglang/srt/sampling_params.py index 29067dc851..6a8823cc4d 100644 --- a/python/sglang/srt/sampling_params.py +++ b/python/sglang/srt/sampling_params.py @@ -36,7 +36,6 @@ def __init__( ignore_eos: bool = False, skip_special_tokens: bool = True, spaces_between_special_tokens: bool = True, - dtype: Optional[str] = None, regex: Optional[str] = None, n: int = 1, ) -> None: @@ -53,7 +52,6 @@ def __init__( self.ignore_eos = ignore_eos 
self.skip_special_tokens = skip_special_tokens self.spaces_between_special_tokens = spaces_between_special_tokens - self.dtype = dtype self.regex = regex self.n = n @@ -63,8 +61,6 @@ def __init__( self.top_k = 1 if self.top_k == -1: self.top_k = 1 << 30 # whole vocabulary - if self.dtype == "int": - self.stop_strs = [" ", "\n"] def verify(self): if self.temperature < 0.0: diff --git a/python/sglang/test/test_programs.py b/python/sglang/test/test_programs.py index 7c7c9bdcb1..6e39f0aa99 100644 --- a/python/sglang/test/test_programs.py +++ b/python/sglang/test/test_programs.py @@ -103,13 +103,13 @@ def decode_int(s): def test_decode_json_regex(): @sgl.function def decode_json(s): - from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STRING + from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STR s += "Generate a JSON object to describe the basic city information of Paris.\n" with s.var_scope("json_output"): s += "{\n" - s += ' "name": ' + sgl.gen(regex=REGEX_STRING + ",") + "\n" + s += ' "name": ' + sgl.gen(regex=REGEX_STR + ",") + "\n" s += ' "population": ' + sgl.gen(regex=REGEX_INT + ",") + "\n" s += ' "area": ' + sgl.gen(regex=REGEX_INT + ",") + "\n" s += ' "latitude": ' + sgl.gen(regex=REGEX_FLOAT) + "\n" @@ -359,6 +359,30 @@ def regex_gen(s): assert re.match(regex, answer) +def test_dtype_gen(): + @sgl.function + def dtype_gen(s): + s += "Q: What is the full name of DNS?\n" + s += "A: The full nams is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n" + s += "Q: Which year was DNS invented?\n" + s += "A: " + sgl.gen("int_res", dtype=int) + "\n" + s += "Q: What is the value of pi?\n" + s += "A: " + sgl.gen("float_res", dtype=float) + "\n" + s += "Q: Is the sky blue?\n" + s += "A: " + sgl.gen("bool_res", dtype=bool) + "\n" + + state = dtype_gen.run() + + try: + state["int_res"] = int(state["int_res"]) + state["float_res"] = float(state["float_res"]) + state["bool_res"] = bool(state["bool_res"]) + # assert state["str_res"].startswith('"') and state["str_res"].endswith('"') + except ValueError: + print(state) + raise + + def test_completion_speculative(): @sgl.function(num_api_spec_tokens=64) def gen_character_spec(s): diff --git a/test/lang/test_srt_backend.py b/test/lang/test_srt_backend.py index b2a07ae36c..fcd86ae3d3 100644 --- a/test/lang/test_srt_backend.py +++ b/test/lang/test_srt_backend.py @@ -1,10 +1,10 @@ -import json import unittest import sglang as sgl from sglang.test.test_programs import ( test_decode_int, test_decode_json_regex, + test_dtype_gen, test_expert_answer, test_few_shot_qa, test_mt_bench, @@ -59,6 +59,9 @@ def test_stream(self): def test_regex(self): test_regex() + def test_dtype_gen(self): + test_dtype_gen() + if __name__ == "__main__": unittest.main() From 96a2093ef021b7fb10cf727050e0c87494c5463a Mon Sep 17 00:00:00 2001 From: Ying Sheng Date: Wed, 14 Aug 2024 10:37:01 -0700 Subject: [PATCH 027/118] [Fix] Compatibility of window attention and cuda graph (#1090) --- python/sglang/srt/layers/radix_attention.py | 16 ++++-- .../srt/model_executor/cuda_graph_runner.py | 55 +++++++++++++++---- .../srt/model_executor/forward_batch_info.py | 10 +--- .../sglang/srt/model_executor/model_runner.py | 22 +++----- python/sglang/srt/server_args.py | 4 +- .../test/{long_prompt => long_prompt.txt} | 0 python/sglang/test/runners.py | 2 +- 7 files changed, 70 insertions(+), 39 deletions(-) rename python/sglang/test/{long_prompt => long_prompt.txt} (100%) diff --git a/python/sglang/srt/layers/radix_attention.py b/python/sglang/srt/layers/radix_attention.py index 
49b86ad191..978a5d4c05 100644 --- a/python/sglang/srt/layers/radix_attention.py +++ b/python/sglang/srt/layers/radix_attention.py @@ -34,6 +34,7 @@ def __init__( scaling: float, num_kv_heads: int, layer_id: int, + reuse: bool = False, sliding_window_size: int = -1, logit_cap: int = -1, v_head_dim: int = -1, @@ -47,6 +48,7 @@ def __init__( self.v_head_dim = v_head_dim if v_head_dim != -1 else head_dim self.scaling = scaling self.layer_id = layer_id + self.reuse = reuse self.sliding_window_size = sliding_window_size if ( @@ -127,8 +129,9 @@ def extend_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata): if isinstance(prefill_wrapper_paged, list): prefill_wrapper_paged = prefill_wrapper_paged[1] - if not input_metadata.flashinfer_use_ragged: - self.store_kv_cache(k, v, input_metadata) + if not input_metadata.flashinfer_use_ragged or self.reuse: + if not self.reuse: + self.store_kv_cache(k, v, input_metadata) o = prefill_wrapper_paged.forward( q.contiguous().view(-1, self.tp_q_head_num, self.head_dim), @@ -179,7 +182,8 @@ def decode_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata): if isinstance(decode_wrapper, list): decode_wrapper = decode_wrapper[1] - self.store_kv_cache(k, v, input_metadata) + if not self.reuse: + self.store_kv_cache(k, v, input_metadata) o = decode_wrapper.forward( q.contiguous().view(-1, self.tp_q_head_num, self.head_dim), @@ -191,8 +195,10 @@ def decode_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata): return o.view(-1, self.tp_q_head_num * self.head_dim) def forward(self, q, k, v, input_metadata: InputMetadata): - k = k.view(-1, self.tp_k_head_num, self.qk_head_dim) - v = v.view(-1, self.tp_v_head_num, self.v_head_dim) + if k is not None: + assert v is not None + k = k.view(-1, self.tp_k_head_num, self.qk_head_dim) + v = v.view(-1, self.tp_v_head_num, self.v_head_dim) if input_metadata.forward_mode == ForwardMode.EXTEND: return self.extend_forward(q, k, v, input_metadata) diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index a74e8eef78..ed26322c34 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -107,9 +107,6 @@ def __init__(self, model_runner, max_batch_size_to_capture, use_torch_compile): ) # FlashInfer inputs - self.flashinfer_workspace_buffer = ( - self.model_runner.flashinfer_workspace_buffers[0] - ) self.flashinfer_kv_indptr = torch.zeros( (self.max_bs + 1,), dtype=torch.int32, device="cuda" ) @@ -121,6 +118,23 @@ def __init__(self, model_runner, max_batch_size_to_capture, use_torch_compile): self.flashinfer_kv_last_page_len = torch.ones( (self.max_bs,), dtype=torch.int32, device="cuda" ) + if model_runner.sliding_window_size is None: + self.flashinfer_workspace_buffer = ( + self.model_runner.flashinfer_workspace_buffers[0] + ) + else: + self.flashinfer_workspace_buffers = [ + self.model_runner.flashinfer_workspace_buffers[0], + self.model_runner.flashinfer_workspace_buffers[2], + ] + self.flashinfer_kv_indptr = [ + self.flashinfer_kv_indptr, + self.flashinfer_kv_indptr.clone(), + ] + self.flashinfer_kv_indices = [ + self.flashinfer_kv_indices, + self.flashinfer_kv_indices.clone(), + ] self.compile_bs = [1, 2, 4, 8, 16, 24, 32] if use_torch_compile else [] @@ -171,15 +185,32 @@ def capture_one_batch_size(self, bs, forward): use_tensor_cores = True else: use_tensor_cores = False - flashinfer_decode_wrapper = BatchDecodeWithPagedKVCacheWrapper( - 
self.flashinfer_workspace_buffer, - "NHD", - use_cuda_graph=True, - use_tensor_cores=use_tensor_cores, - paged_kv_indptr_buffer=self.flashinfer_kv_indptr[: bs + 1], - paged_kv_indices_buffer=self.flashinfer_kv_indices, - paged_kv_last_page_len_buffer=self.flashinfer_kv_last_page_len[:bs], - ) + if self.model_runner.sliding_window_size is None: + flashinfer_decode_wrapper = BatchDecodeWithPagedKVCacheWrapper( + self.flashinfer_workspace_buffer, + "NHD", + use_cuda_graph=True, + use_tensor_cores=use_tensor_cores, + paged_kv_indptr_buffer=self.flashinfer_kv_indptr[: bs + 1], + paged_kv_indices_buffer=self.flashinfer_kv_indices, + paged_kv_last_page_len_buffer=self.flashinfer_kv_last_page_len[:bs], + ) + else: + flashinfer_decode_wrapper = [] + for i in range(2): + flashinfer_decode_wrapper.append( + BatchDecodeWithPagedKVCacheWrapper( + self.flashinfer_workspace_buffers[i], + "NHD", + use_cuda_graph=True, + use_tensor_cores=use_tensor_cores, + paged_kv_indptr_buffer=self.flashinfer_kv_indptr[i][: bs + 1], + paged_kv_indices_buffer=self.flashinfer_kv_indices[i], + paged_kv_last_page_len_buffer=self.flashinfer_kv_last_page_len[ + :bs + ], + ) + ) update_flashinfer_indices( ForwardMode.DECODE, self.model_runner, diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py index 3b2ee9de06..809b3329df 100644 --- a/python/sglang/srt/model_executor/forward_batch_info.py +++ b/python/sglang/srt/model_executor/forward_batch_info.py @@ -154,7 +154,6 @@ def from_schedule_batch( model_runner: "ModelRunner", batch: ScheduleBatch, forward_mode: ForwardMode, - sliding_window_size: Optional[int] = None, ): ret = cls( forward_mode=forward_mode, @@ -198,7 +197,7 @@ def from_schedule_batch( ): flashinfer_use_ragged = True ret.init_flashinfer_handlers( - model_runner, prefix_lens, flashinfer_use_ragged, sliding_window_size + model_runner, prefix_lens, flashinfer_use_ragged ) return ret @@ -221,7 +220,6 @@ def init_flashinfer_handlers( model_runner, prefix_lens, flashinfer_use_ragged, - sliding_window_size=None, ): update_flashinfer_indices( self.forward_mode, @@ -230,7 +228,6 @@ def init_flashinfer_handlers( self.seq_lens, prefix_lens, flashinfer_use_ragged=flashinfer_use_ragged, - sliding_window_size=sliding_window_size, ) ( @@ -254,7 +251,6 @@ def update_flashinfer_indices( prefix_lens, flashinfer_decode_wrapper=None, flashinfer_use_ragged=False, - sliding_window_size=None, ): """Init auxiliary variables for FlashInfer attention backend.""" num_qo_heads = model_runner.model_config.num_attention_heads // model_runner.tp_size @@ -262,7 +258,7 @@ def update_flashinfer_indices( head_dim = model_runner.model_config.head_dim batch_size = len(req_pool_indices) - if sliding_window_size is None: + if model_runner.sliding_window_size is None: if flashinfer_use_ragged: paged_kernel_lens = prefix_lens else: @@ -335,7 +331,7 @@ def update_flashinfer_indices( if wrapper_id == 0 and forward_mode == ForwardMode.DECODE: paged_kernel_lens = torch.minimum( - paged_kernel_lens, torch.tensor(sliding_window_size) + paged_kernel_lens, torch.tensor(model_runner.sliding_window_size) ) kv_start_idx = seq_lens - paged_kernel_lens else: diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 9da284da65..0a74834237 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -187,6 +187,11 @@ def load_model(self): scheduler_config=None, cache_config=None, 
) + self.sliding_window_size = ( + self.model.get_window_size() + if hasattr(self.model, "get_window_size") + else None + ) self.is_generation = is_generation_model( self.model_config.hf_config.architectures ) @@ -295,12 +300,6 @@ def init_cublas(self): return c def init_flashinfer(self): - self.sliding_window_size = ( - self.model.get_window_size() - if hasattr(self.model, "get_window_size") - else None - ) - if self.server_args.disable_flashinfer: assert ( self.sliding_window_size is None @@ -339,7 +338,7 @@ def init_flashinfer(self): use_tensor_cores=use_tensor_cores, ) else: - workspace_buffers = torch.empty( + self.flashinfer_workspace_buffers = torch.empty( 4, global_config.flashinfer_workspace_size, dtype=torch.uint8, @@ -351,17 +350,17 @@ def init_flashinfer(self): for i in range(2): self.flashinfer_prefill_wrapper_ragged.append( BatchPrefillWithRaggedKVCacheWrapper( - workspace_buffers[2 * i + 0], "NHD" + self.flashinfer_workspace_buffers[2 * i + 0], "NHD" ) ) self.flashinfer_prefill_wrapper_paged.append( BatchPrefillWithPagedKVCacheWrapper( - workspace_buffers[2 * i + 1], "NHD" + self.flashinfer_workspace_buffers[2 * i + 1], "NHD" ) ) self.flashinfer_decode_wrapper.append( BatchDecodeWithPagedKVCacheWrapper( - workspace_buffers[2 * i + 0], + self.flashinfer_workspace_buffers[2 * i + 0], "NHD", use_tensor_cores=use_tensor_cores, ) @@ -404,7 +403,6 @@ def forward_decode(self, batch: ScheduleBatch): self, batch, ForwardMode.DECODE, - sliding_window_size=self.sliding_window_size, ) return self.model.forward( @@ -417,7 +415,6 @@ def forward_extend(self, batch: ScheduleBatch): self, batch, forward_mode=ForwardMode.EXTEND, - sliding_window_size=self.sliding_window_size, ) return self.model.forward( batch.input_ids, input_metadata.positions, input_metadata @@ -429,7 +426,6 @@ def forward_extend_multi_modal(self, batch: ScheduleBatch): self, batch, forward_mode=ForwardMode.EXTEND, - sliding_window_size=self.sliding_window_size, ) return self.model.forward( batch.input_ids, diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 5e7996b801..8ed66960b2 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -453,10 +453,12 @@ def check_server_args(self): logger.info( f"When using sliding window in gemma-2, disable radix_cache, regex_jump_forward, and turn on flashinfer." 
) + # FIXME: compatibility with radix attention self.disable_radix_cache = True + # FIXME: compatibility with jump forward self.disable_regex_jump_forward = True self.disable_flashinfer = False - self.disable_cuda_graph = True + # FIXME: compatibility with chunked prefill self.chunked_prefill_size = None diff --git a/python/sglang/test/long_prompt b/python/sglang/test/long_prompt.txt similarity index 100% rename from python/sglang/test/long_prompt rename to python/sglang/test/long_prompt.txt diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index c8357a16c6..e325ecb710 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -36,7 +36,7 @@ ] dirpath = os.path.dirname(__file__) -with open(os.path.join(dirpath, "long_prompt"), "r") as f: +with open(os.path.join(dirpath, "long_prompt.txt"), "r") as f: long_prompt = f.read() DEFAULT_PROMPTS.append(long_prompt) From 1c2b5f524041752442856428db451510a75add96 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Thu, 15 Aug 2024 01:39:15 +0800 Subject: [PATCH 028/118] docs: update nsys usage (#1103) --- benchmark/latency_throughput/README.md | 36 +++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/benchmark/latency_throughput/README.md b/benchmark/latency_throughput/README.md index b6c2e67971..b1061793aa 100644 --- a/benchmark/latency_throughput/README.md +++ b/benchmark/latency_throughput/README.md @@ -33,8 +33,42 @@ python3 bench_serving.py --backend srt --port 30000 --tokenizer meta-llama/Llama ``` ### Profile with Nsight +0. Prerequisite +```bash +# install nsys +# https://docs.nvidia.com/nsight-systems/InstallationGuide/index.html +apt update +apt install -y --no-install-recommends gnupg +echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu$(source /etc/lsb-release; echo "$DISTRIB_RELEASE" | tr -d .)/$(dpkg --print-architecture) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list +apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub +apt update +apt install nsight-systems-cli +``` + 1. To profile a single batch, use `nsys profile --cuda-graph-trace=node python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3-8B --batch-size 64 --input-len 512` -2. To profile a server, use `nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3-8B`. + +2. To profile a server, e.g. + +```bash +# server +# set the delay and duration times according to needs +nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node -o sglang.out --delay 60 --duration 70 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --disable-radix-cache + +# client +python3 -m sglang.bench_serving --backend sglang --num-prompts 6000 --dataset-name random --random-input 4096 --random-output 2048 +``` + +3. Use NVTX, e.g. 
+ +```bash +# install nvtx +pip install nvtx + +# code snippets +import nvtx +with nvtx.annotate("description", color="color"): + # some critical code +``` ## Other baselines From 73cf6834f2a6ee0d566a1ca70db5e2c05c76486b Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Wed, 14 Aug 2024 17:31:39 -0700 Subject: [PATCH 029/118] Support `stop_token_ids` in sglang API (#1092) --- python/sglang/api.py | 6 ++++++ python/sglang/lang/interpreter.py | 6 ++++-- python/sglang/lang/ir.py | 11 ++++++++++- python/sglang/srt/managers/schedule_batch.py | 10 ++++++---- python/sglang/test/test_programs.py | 11 +++++++---- test/srt/test_moe_serving_throughput.py | 2 +- 6 files changed, 34 insertions(+), 12 deletions(-) diff --git a/python/sglang/api.py b/python/sglang/api.py index 2242b4a4c6..887ffce76e 100644 --- a/python/sglang/api.py +++ b/python/sglang/api.py @@ -62,6 +62,7 @@ def gen( name: Optional[str] = None, max_tokens: Optional[int] = None, stop: Optional[Union[str, List[str]]] = None, + stop_token_ids: Optional[List[int]] = None, temperature: Optional[float] = None, top_p: Optional[float] = None, top_k: Optional[int] = None, @@ -98,6 +99,7 @@ def gen( name, max_tokens, stop, + stop_token_ids, temperature, top_p, top_k, @@ -117,6 +119,7 @@ def gen_int( name: Optional[str] = None, max_tokens: Optional[int] = None, stop: Optional[Union[str, List[str]]] = None, + stop_token_ids: Optional[List[int]] = None, temperature: Optional[float] = None, top_p: Optional[float] = None, top_k: Optional[int] = None, @@ -132,6 +135,7 @@ def gen_int( name, max_tokens, stop, + stop_token_ids, temperature, top_p, top_k, @@ -151,6 +155,7 @@ def gen_string( name: Optional[str] = None, max_tokens: Optional[int] = None, stop: Optional[Union[str, List[str]]] = None, + stop_token_ids: Optional[List[int]] = None, temperature: Optional[float] = None, top_p: Optional[float] = None, top_k: Optional[int] = None, @@ -166,6 +171,7 @@ def gen_string( name, max_tokens, stop, + stop_token_ids, temperature, top_p, top_k, diff --git a/python/sglang/lang/interpreter.py b/python/sglang/lang/interpreter.py index cf53fac303..844c9d062b 100644 --- a/python/sglang/lang/interpreter.py +++ b/python/sglang/lang/interpreter.py @@ -20,7 +20,6 @@ SglConstantText, SglExpr, SglExprList, - SglFunction, SglGen, SglImage, SglRoleBegin, @@ -181,8 +180,10 @@ def __init__( num_api_spec_tokens=None, use_thread=True, ): + from sglang.lang.backend.base_backend import BaseBackend + self.sid = uuid.uuid4().hex - self.backend = backend + self.backend: BaseBackend = backend self.arguments: Dict[str, Any] = arguments self.default_sampling_para = default_sampling_para self.stream = stream @@ -658,6 +659,7 @@ def _resolve_sampling_params(self, sampling_params): for item in [ "max_new_tokens", "stop", + "stop_token_ids", "temperature", "top_p", "top_k", diff --git a/python/sglang/lang/ir.py b/python/sglang/lang/ir.py index 0166b86870..9db5f2719e 100644 --- a/python/sglang/lang/ir.py +++ b/python/sglang/lang/ir.py @@ -18,6 +18,7 @@ class SglSamplingParams: max_new_tokens: int = 128 stop: Union[str, List[str]] = () + stop_token_ids: Optional[List[int]] = () temperature: float = 1.0 top_p: float = 1.0 top_k: int = -1 # -1 means disable @@ -37,6 +38,7 @@ def clone(self): return SglSamplingParams( self.max_new_tokens, self.stop, + self.stop_token_ids, self.temperature, self.top_p, self.top_k, @@ -108,6 +110,7 @@ def to_srt_kwargs(self): return { "max_new_tokens": self.max_new_tokens, "stop": self.stop, + "stop_token_ids": self.stop_token_ids, "temperature": 
self.temperature, "top_p": self.top_p, "top_k": self.top_k, @@ -141,7 +144,8 @@ def run( self, *args, max_new_tokens: int = 128, - stop: Union[str, List[str]] = (), + stop: Union[str, List[str]] = [], + stop_token_ids: Optional[List[int]] = [], temperature: float = 1.0, top_p: float = 1.0, top_k: int = -1, @@ -161,6 +165,7 @@ def run( default_sampling_para = SglSamplingParams( max_new_tokens=max_new_tokens, stop=stop, + stop_token_ids=stop_token_ids, temperature=temperature, top_p=top_p, top_k=top_k, @@ -181,6 +186,7 @@ def run_batch( *, max_new_tokens: int = 128, stop: Union[str, List[str]] = (), + stop_token_ids: Optional[List[int]] = [], temperature: float = 1.0, top_p: float = 1.0, top_k: int = -1, @@ -218,6 +224,7 @@ def run_batch( default_sampling_para = SglSamplingParams( max_new_tokens=max_new_tokens, stop=stop, + stop_token_ids=stop_token_ids, temperature=temperature, top_p=top_p, top_k=top_k, @@ -397,6 +404,7 @@ def __init__( name: Optional[str] = None, max_new_tokens: Optional[int] = None, stop: Optional[Union[str, List[str]]] = None, + stop_token_ids: Optional[List[int]] = None, temperature: Optional[float] = None, top_p: Optional[float] = None, top_k: Optional[int] = None, @@ -416,6 +424,7 @@ def __init__( self.sampling_params = SglSamplingParams( max_new_tokens=max_new_tokens, stop=stop, + stop_token_ids=stop_token_ids, temperature=temperature, top_p=top_p, top_k=top_k, diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 9037f5a6ea..9e86c9b188 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -235,10 +235,12 @@ def check_finished(self): return last_token_id = self.output_ids[-1] - if self.tokenizer is None: - matched_eos = last_token_id in self.sampling_params.stop_token_ids - else: - matched_eos = last_token_id == self.tokenizer.eos_token_id + + matched_eos = last_token_id in self.sampling_params.stop_token_ids + + if self.tokenizer is not None: + matched_eos |= last_token_id == self.tokenizer.eos_token_id + if matched_eos and not self.sampling_params.ignore_eos: self.finished_reason = FINISH_MATCHED_TOKEN(matched=last_token_id) return diff --git a/python/sglang/test/test_programs.py b/python/sglang/test/test_programs.py index 6e39f0aa99..ce40255855 100644 --- a/python/sglang/test/test_programs.py +++ b/python/sglang/test/test_programs.py @@ -106,13 +106,16 @@ def decode_json(s): from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STR s += "Generate a JSON object to describe the basic city information of Paris.\n" + s += "Here are the JSON object:\n" + + # NOTE: we recommend using dtype gen or whole regex string to control the output with s.var_scope("json_output"): s += "{\n" - s += ' "name": ' + sgl.gen(regex=REGEX_STR + ",") + "\n" - s += ' "population": ' + sgl.gen(regex=REGEX_INT + ",") + "\n" - s += ' "area": ' + sgl.gen(regex=REGEX_INT + ",") + "\n" - s += ' "latitude": ' + sgl.gen(regex=REGEX_FLOAT) + "\n" + s += ' "name": ' + sgl.gen(regex=REGEX_STR) + ",\n" + s += ' "population": ' + sgl.gen(regex=REGEX_INT, stop=[" ", "\n"]) + ",\n" + s += ' "area": ' + sgl.gen(regex=REGEX_INT, stop=[" ", "\n"]) + ",\n" + s += ' "latitude": ' + sgl.gen(regex=REGEX_FLOAT, stop=[" ", "\n"]) + "\n" s += "}" ret = decode_json.run(temperature=0.0) diff --git a/test/srt/test_moe_serving_throughput.py b/test/srt/test_moe_serving_throughput.py index 713eba7abb..80b445f490 100644 --- a/test/srt/test_moe_serving_throughput.py +++ 
b/test/srt/test_moe_serving_throughput.py @@ -84,7 +84,7 @@ def test_default_without_radix_cache(self): if os.getenv("SGLANG_IS_IN_CI", "false") == "true": # A100 (PCIE) performance - assert res["output_throughput"] > 940 + assert res["output_throughput"] > 930 def test_default_with_chunked_prefill(self): res = self.run_test( From 6767e2229f6245a30fff0373ecceb1c13792d594 Mon Sep 17 00:00:00 2001 From: Ying Sheng Date: Wed, 14 Aug 2024 17:43:14 -0700 Subject: [PATCH 030/118] Support jinja as chat template file (#1104) --- python/sglang/srt/openai_api/adapter.py | 53 +++++++++++++++---------- python/sglang/srt/server.py | 7 +--- 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 8998cf39de..15aa701cb0 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -117,7 +117,7 @@ def create_streaming_error_response( return json_str -def load_chat_template_for_openai_api(chat_template_arg): +def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg): global chat_template_name print(f"Use chat template: {chat_template_arg}") @@ -127,27 +127,38 @@ def load_chat_template_for_openai_api(chat_template_arg): f"Chat template {chat_template_arg} is not a built-in template name " "or a valid chat template file path." ) - with open(chat_template_arg, "r") as filep: - template = json.load(filep) - try: - sep_style = SeparatorStyle[template["sep_style"]] - except KeyError: - raise ValueError( - f"Unknown separator style: {template['sep_style']}" - ) from None - register_conv_template( - Conversation( - name=template["name"], - system_template=template["system"] + "\n{system_message}", - system_message=template.get("system_message", ""), - roles=(template["user"], template["assistant"]), - sep_style=sep_style, - sep=template.get("sep", "\n"), - stop_str=template["stop_str"], - ), - override=True, + if chat_template_arg.endswith(".jinja"): + with open(chat_template_arg, "r") as f: + chat_template = "".join(f.readlines()).strip("\n") + tokenizer_manager.tokenizer.chat_template = chat_template.replace( + "\\n", "\n" ) - chat_template_name = template["name"] + chat_template_name = None + else: + assert chat_template_arg.endswith( + ".json" + ), "unrecognized format of chat template file" + with open(chat_template_arg, "r") as filep: + template = json.load(filep) + try: + sep_style = SeparatorStyle[template["sep_style"]] + except KeyError: + raise ValueError( + f"Unknown separator style: {template['sep_style']}" + ) from None + register_conv_template( + Conversation( + name=template["name"], + system_template=template["system"] + "\n{system_message}", + system_message=template.get("system_message", ""), + roles=(template["user"], template["assistant"]), + sep_style=sep_style, + sep=template.get("sep", "\n"), + stop_str=template["stop_str"], + ), + override=True, + ) + chat_template_name = template["name"] else: chat_template_name = chat_template_arg diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 8f735ac0c7..973f9c8e12 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -288,6 +288,8 @@ def launch_server( # Launch processes tokenizer_manager = TokenizerManager(server_args, port_args, model_overide_args) + if server_args.chat_template: + load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template) pipe_controller_reader, pipe_controller_writer = mp.Pipe(duplex=False) 
pipe_detoken_reader, pipe_detoken_writer = mp.Pipe(duplex=False) @@ -375,11 +377,6 @@ def _set_envs_and_config(server_args: ServerArgs): # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency. maybe_set_triton_cache_manager() - # Set global chat template - if server_args.chat_template: - # TODO: replace this with huggingface transformers template - load_chat_template_for_openai_api(server_args.chat_template) - # Check flashinfer version if not server_args.disable_flashinfer: assert_pkg_version( From 326df4bab25583eb1dcfaaf0f5f1f28b20d35ae7 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 14 Aug 2024 19:25:37 -0700 Subject: [PATCH 031/118] Use a single workspace for flashinfer (#1077) --- benchmark/gsm8k/bench_sglang.py | 2 +- python/sglang/global_config.py | 2 +- .../srt/model_executor/cuda_graph_runner.py | 12 ++++++------ python/sglang/srt/model_executor/model_runner.py | 16 +++++++--------- python/sglang/srt/server.py | 2 +- 5 files changed, 16 insertions(+), 18 deletions(-) diff --git a/benchmark/gsm8k/bench_sglang.py b/benchmark/gsm8k/bench_sglang.py index 652086f913..d9d4b0ab20 100644 --- a/benchmark/gsm8k/bench_sglang.py +++ b/benchmark/gsm8k/bench_sglang.py @@ -64,7 +64,7 @@ def main(args): @sgl.function def few_shot_gsm8k(s, question): s += few_shot_examples + question - s += sgl.gen("answer", max_tokens=512, stop="Question") + s += sgl.gen("answer", max_tokens=512, stop=["Question", "Assistant:"]) ##################################### ########## SGL Program End ########## diff --git a/python/sglang/global_config.py b/python/sglang/global_config.py index b02ce9f81e..d5f16e2ae5 100644 --- a/python/sglang/global_config.py +++ b/python/sglang/global_config.py @@ -27,7 +27,7 @@ def __init__(self): # Runtime constants: others self.num_continue_decode_steps = 10 self.retract_decode_steps = 20 - self.flashinfer_workspace_size = 192 * 1024 * 1024 + self.flashinfer_workspace_size = 384 * 1024 * 1024 # Output tokenization configs self.skip_special_tokens_in_output = True diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index ed26322c34..3d4e5d4c6a 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -120,13 +120,13 @@ def __init__(self, model_runner, max_batch_size_to_capture, use_torch_compile): ) if model_runner.sliding_window_size is None: self.flashinfer_workspace_buffer = ( - self.model_runner.flashinfer_workspace_buffers[0] + self.model_runner.flashinfer_workspace_buffer ) else: - self.flashinfer_workspace_buffers = [ - self.model_runner.flashinfer_workspace_buffers[0], - self.model_runner.flashinfer_workspace_buffers[2], - ] + self.flashinfer_workspace_buffer = ( + self.model_runner.flashinfer_workspace_buffer + ) + self.flashinfer_kv_indptr = [ self.flashinfer_kv_indptr, self.flashinfer_kv_indptr.clone(), @@ -200,7 +200,7 @@ def capture_one_batch_size(self, bs, forward): for i in range(2): flashinfer_decode_wrapper.append( BatchDecodeWithPagedKVCacheWrapper( - self.flashinfer_workspace_buffers[i], + self.flashinfer_workspace_buffer, "NHD", use_cuda_graph=True, use_tensor_cores=use_tensor_cores, diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 0a74834237..6826bf1a4e 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -318,28 +318,26 @@ def 
init_flashinfer(self): use_tensor_cores = False if self.sliding_window_size is None: - self.flashinfer_workspace_buffers = torch.empty( - 2, + self.flashinfer_workspace_buffer = torch.empty( global_config.flashinfer_workspace_size, dtype=torch.uint8, device="cuda", ) self.flashinfer_prefill_wrapper_ragged = ( BatchPrefillWithRaggedKVCacheWrapper( - self.flashinfer_workspace_buffers[0], "NHD" + self.flashinfer_workspace_buffer, "NHD" ) ) self.flashinfer_prefill_wrapper_paged = BatchPrefillWithPagedKVCacheWrapper( - self.flashinfer_workspace_buffers[1], "NHD" + self.flashinfer_workspace_buffer, "NHD" ) self.flashinfer_decode_wrapper = BatchDecodeWithPagedKVCacheWrapper( - self.flashinfer_workspace_buffers[0], + self.flashinfer_workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores, ) else: self.flashinfer_workspace_buffers = torch.empty( - 4, global_config.flashinfer_workspace_size, dtype=torch.uint8, device="cuda", @@ -350,17 +348,17 @@ def init_flashinfer(self): for i in range(2): self.flashinfer_prefill_wrapper_ragged.append( BatchPrefillWithRaggedKVCacheWrapper( - self.flashinfer_workspace_buffers[2 * i + 0], "NHD" + self.flashinfer_workspace_buffer, "NHD" ) ) self.flashinfer_prefill_wrapper_paged.append( BatchPrefillWithPagedKVCacheWrapper( - self.flashinfer_workspace_buffers[2 * i + 1], "NHD" + self.flashinfer_workspace_buffer, "NHD" ) ) self.flashinfer_decode_wrapper.append( BatchDecodeWithPagedKVCacheWrapper( - self.flashinfer_workspace_buffers[2 * i + 0], + self.flashinfer_workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores, ) diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 973f9c8e12..ae886796c5 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -381,7 +381,7 @@ def _set_envs_and_config(server_args: ServerArgs): if not server_args.disable_flashinfer: assert_pkg_version( "flashinfer", - "0.1.4", + "0.1.5", "Please uninstall the old version and " "reinstall the latest version by following the instructions " "at https://docs.flashinfer.ai/installation.html.", From 8d2d876fc8ec690db8728d363c593174ee3b97c0 Mon Sep 17 00:00:00 2001 From: Ying Sheng Date: Wed, 14 Aug 2024 21:56:01 -0700 Subject: [PATCH 032/118] [Fix] fix the typo bug for window attention (#1106) --- python/sglang/srt/model_executor/model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 6826bf1a4e..675ca60d02 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -337,7 +337,7 @@ def init_flashinfer(self): use_tensor_cores=use_tensor_cores, ) else: - self.flashinfer_workspace_buffers = torch.empty( + self.flashinfer_workspace_buffer = torch.empty( global_config.flashinfer_workspace_size, dtype=torch.uint8, device="cuda", From e86b1ccbf07d29ec040b5d1d4092f152237db0f8 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 14 Aug 2024 21:56:20 -0700 Subject: [PATCH 033/118] Enable chunked prefill by default (#1040) --- .github/workflows/e2e-test.yml | 4 ++-- python/sglang/srt/server_args.py | 2 +- test/srt/test_moe_serving_throughput.py | 4 ++-- test/srt/test_serving_throughput.py | 10 +++++----- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index cb11e0db53..ad271c37ed 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -47,8 +47,8 @@ jobs: python3 -m 
unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache timeout-minutes: 10 - - name: Benchmark Serving Throughput (w/ ChunkedPrefill) + - name: Benchmark Serving Throughput (w/o ChunkedPrefill) run: | cd test/srt - python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_with_chunked_prefill + python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill timeout-minutes: 10 diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 8ed66960b2..6512e1b6ec 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -49,7 +49,7 @@ class ServerArgs: max_running_requests: Optional[int] = None max_num_reqs: Optional[int] = None max_total_tokens: Optional[int] = None - chunked_prefill_size: int = -1 + chunked_prefill_size: int = 8192 max_prefill_tokens: int = 16384 schedule_policy: str = "lpm" schedule_conservativeness: float = 1.0 diff --git a/test/srt/test_moe_serving_throughput.py b/test/srt/test_moe_serving_throughput.py index 80b445f490..bbcd512276 100644 --- a/test/srt/test_moe_serving_throughput.py +++ b/test/srt/test_moe_serving_throughput.py @@ -86,11 +86,11 @@ def test_default_without_radix_cache(self): # A100 (PCIE) performance assert res["output_throughput"] > 930 - def test_default_with_chunked_prefill(self): + def test_default_without_chunked_prefill(self): res = self.run_test( disable_radix_cache=ServerArgs.disable_radix_cache, disable_flashinfer=ServerArgs.disable_flashinfer, - chunked_prefill_size=8192, + chunked_prefill_size=-1, ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py index c99d2e07e2..261ac6ec52 100644 --- a/test/srt/test_serving_throughput.py +++ b/test/srt/test_serving_throughput.py @@ -71,7 +71,7 @@ def test_default(self): if os.getenv("SGLANG_IS_IN_CI", "false") == "true": # A100 (PCIE) performance - assert res["output_throughput"] >= 1400 + assert res["output_throughput"] > 1400 def test_default_without_radix_cache(self): res = self.run_test( @@ -82,18 +82,18 @@ def test_default_without_radix_cache(self): if os.getenv("SGLANG_IS_IN_CI", "false") == "true": # A100 (PCIE) performance - assert res["output_throughput"] >= 1450 + assert res["output_throughput"] > 1450 - def test_default_with_chunked_prefill(self): + def test_default_without_chunked_prefill(self): res = self.run_test( disable_radix_cache=ServerArgs.disable_radix_cache, disable_flashinfer=ServerArgs.disable_flashinfer, - chunked_prefill_size=8192, + chunked_prefill_size=-1, ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": # A100 (PCIE) performance - assert res["output_throughput"] >= 1400 + assert res["output_throughput"] > 1400 def test_all_cases(self): for disable_radix_cache in [False, True]: From 14cb544d56b06b25483c4cf9c817b657acff8604 Mon Sep 17 00:00:00 2001 From: Ying Sheng Date: Thu, 15 Aug 2024 00:53:24 -0700 Subject: [PATCH 034/118] [Fix] fix flashinfer usage for window attention (#1107) --- python/sglang/srt/layers/radix_attention.py | 5 +---- .../srt/model_executor/forward_batch_info.py | 14 ++++++-------- python/sglang/srt/model_executor/model_runner.py | 11 +++++------ 3 files changed, 12 insertions(+), 18 deletions(-) diff --git a/python/sglang/srt/layers/radix_attention.py b/python/sglang/srt/layers/radix_attention.py index 978a5d4c05..a7474326fc 100644 --- a/python/sglang/srt/layers/radix_attention.py +++ 
b/python/sglang/srt/layers/radix_attention.py @@ -120,12 +120,9 @@ def extend_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata): # using two wrappers is unnecessary in the current PR, but are prepared for future PRs prefill_wrapper_ragged = input_metadata.flashinfer_prefill_wrapper_ragged prefill_wrapper_paged = input_metadata.flashinfer_prefill_wrapper_paged - if self.sliding_window_size != -1: - prefill_wrapper_ragged = prefill_wrapper_ragged[0] + if self.sliding_window_size != -1 or self.reuse: prefill_wrapper_paged = prefill_wrapper_paged[0] else: - if isinstance(prefill_wrapper_ragged, list): - prefill_wrapper_ragged = prefill_wrapper_ragged[1] if isinstance(prefill_wrapper_paged, list): prefill_wrapper_paged = prefill_wrapper_paged[1] diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py index 809b3329df..66479b255b 100644 --- a/python/sglang/srt/model_executor/forward_batch_info.py +++ b/python/sglang/srt/model_executor/forward_batch_info.py @@ -324,9 +324,11 @@ def update_flashinfer_indices( else: kv_last_page_len = torch.ones((batch_size,), dtype=torch.int32, device="cuda") for wrapper_id in range(2): - if flashinfer_use_ragged: + if flashinfer_use_ragged and wrapper_id == 1: + # full attention use ragged+paged paged_kernel_lens = prefix_lens else: + # window attention use paged only paged_kernel_lens = seq_lens if wrapper_id == 0 and forward_mode == ForwardMode.DECODE: @@ -374,13 +376,9 @@ def update_flashinfer_indices( ) qo_indptr[1:] = torch.cumsum(seq_lens - prefix_lens, dim=0) - if flashinfer_use_ragged: - model_runner.flashinfer_prefill_wrapper_ragged[ - wrapper_id - ].end_forward() - model_runner.flashinfer_prefill_wrapper_ragged[ - wrapper_id - ].begin_forward( + if flashinfer_use_ragged and wrapper_id == 1: + model_runner.flashinfer_prefill_wrapper_ragged.end_forward() + model_runner.flashinfer_prefill_wrapper_ragged.begin_forward( qo_indptr, qo_indptr, num_qo_heads, diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 675ca60d02..748069fc21 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -342,15 +342,14 @@ def init_flashinfer(self): dtype=torch.uint8, device="cuda", ) - self.flashinfer_prefill_wrapper_ragged = [] + self.flashinfer_prefill_wrapper_ragged = ( + BatchPrefillWithRaggedKVCacheWrapper( + self.flashinfer_workspace_buffer, "NHD" + ) + ) self.flashinfer_prefill_wrapper_paged = [] self.flashinfer_decode_wrapper = [] for i in range(2): - self.flashinfer_prefill_wrapper_ragged.append( - BatchPrefillWithRaggedKVCacheWrapper( - self.flashinfer_workspace_buffer, "NHD" - ) - ) self.flashinfer_prefill_wrapper_paged.append( BatchPrefillWithPagedKVCacheWrapper( self.flashinfer_workspace_buffer, "NHD" From 9195d1362aa33db052c01fb9589301299d6fc50c Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Thu, 15 Aug 2024 23:29:35 +0800 Subject: [PATCH 035/118] misc: rm unused model_loader (#1110) --- .../sglang/srt/model_executor/model_runner.py | 10 +- .../sglang/srt/model_loader/model_loader.py | 292 ------------------ python/sglang/srt/model_loader/utils.py | 275 ----------------- 3 files changed, 1 insertion(+), 576 deletions(-) delete mode 100644 python/sglang/srt/model_loader/model_loader.py delete mode 100644 python/sglang/srt/model_loader/utils.py diff --git a/python/sglang/srt/model_executor/model_runner.py 
b/python/sglang/srt/model_executor/model_runner.py index 748069fc21..d3ed96fe02 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -38,6 +38,7 @@ init_distributed_environment, initialize_model_parallel, ) +from vllm.model_executor.model_loader import get_model from vllm.model_executor.models import ModelRegistry from sglang.global_config import global_config @@ -168,15 +169,6 @@ def load_model(self): if self.model_config.model_overide_args is not None: vllm_model_config.hf_config.update(self.model_config.model_overide_args) - if ( - self.server_args.efficient_weight_load - and "llama" in self.server_args.model_path.lower() - and self.server_args.quantization == "fp8" - ): - from sglang.srt.model_loader.model_loader import get_model - else: - from vllm.model_executor.model_loader import get_model - self.model = get_model( model_config=vllm_model_config, device_config=device_config, diff --git a/python/sglang/srt/model_loader/model_loader.py b/python/sglang/srt/model_loader/model_loader.py deleted file mode 100644 index 4b7e32b6e5..0000000000 --- a/python/sglang/srt/model_loader/model_loader.py +++ /dev/null @@ -1,292 +0,0 @@ -""" -Copyright 2023-2024 SGLang Team -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -# temporarily adapted from https://github.com/vllm-project/vllm/blob/10383887e03412196a2689b9398290719c4797bf/vllm/model_executor/model_loader/loader.py -# FIXME: in progress of refactoring the model loader - -import glob -import os -import re -from typing import Any, Dict, Generator, List, Optional, Tuple, Type - -import torch -from torch import nn -from tqdm import tqdm -from vllm.config import ( - CacheConfig, - DeviceConfig, - LoadConfig, - LoadFormat, - LoRAConfig, - ModelConfig, - MultiModalConfig, - ParallelConfig, - SchedulerConfig, -) -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig -from vllm.model_executor.model_loader.utils import ( - get_model_architecture, - set_default_torch_dtype, -) -from vllm.platforms import current_platform - -from sglang.srt.model_loader.utils import ( - download_safetensors_index_file_from_hf, - download_weights_from_hf, - filter_duplicate_safetensors_files, - get_quant_config, - safetensors_weights_iterator, -) - - -def _get_quantization_config( - model_config: ModelConfig, load_config: LoadConfig -) -> Optional[QuantizationConfig]: - """Get the quantization config.""" - if model_config.quantization is not None: - quant_config = get_quant_config(model_config, load_config) - capability = current_platform.get_device_capability() - capability = capability[0] * 10 + capability[1] - if capability < quant_config.get_min_capability(): - raise ValueError( - f"The quantization method {model_config.quantization} is not " - "supported for the current GPU. " - f"Minimum capability: {quant_config.get_min_capability()}. " - f"Current capability: {capability}." 
- ) - supported_dtypes = quant_config.get_supported_act_dtypes() - if model_config.dtype not in supported_dtypes: - raise ValueError( - f"{model_config.dtype} is not supported for quantization " - f"method {model_config.quantization}. Supported dtypes: " - f"{supported_dtypes}" - ) - return quant_config - return None - - -def _get_model_initialization_kwargs( - model_class: Type[nn.Module], - lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], -) -> Dict[str, Any]: - """Get extra kwargs for model initialization.""" - extra_kwargs: Dict[str, Any] = {} - - assert lora_config is None - assert multimodal_config is None - - return extra_kwargs - - -def _initialize_model( - model_config: ModelConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], - cache_config: CacheConfig, -) -> nn.Module: - """Initialize a model with the given configurations.""" - model_class = get_model_architecture(model_config)[0] - quant_config = _get_quantization_config(model_config, load_config) - - return model_class( - config=model_config.hf_config, - cache_config=cache_config, - quant_config=quant_config, - efficient_weight_load=True, - **_get_model_initialization_kwargs(model_class, lora_config, multimodal_config), - ) - - -class ModelLoader: - """Model loader that can load different file types from disk.""" - - def __init__(self, load_config: LoadConfig): - self.load_config = load_config - - def _prepare_weights( - self, model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool - ) -> Tuple[str, List[str], bool]: - """Prepare weights for the model. - - If the model is not local, it will be downloaded.""" - - is_local = os.path.isdir(model_name_or_path) - load_format = self.load_config.load_format - use_safetensors = False - # Some quantized models use .pt files for storing the weights. - if load_format == LoadFormat.AUTO: - allow_patterns = ["*.safetensors", "*.bin"] - elif load_format == LoadFormat.SAFETENSORS: - use_safetensors = True - allow_patterns = ["*.safetensors"] - elif load_format == LoadFormat.PT: - allow_patterns = ["*.pt"] - elif load_format == LoadFormat.NPCACHE: - allow_patterns = ["*.bin"] - else: - raise ValueError(f"Unknown load_format: {load_format}") - - if fall_back_to_pt: - allow_patterns += ["*.pt"] - - if not is_local: - hf_folder = download_weights_from_hf( - model_name_or_path, - self.load_config.download_dir, - allow_patterns, - revision, - ) - else: - hf_folder = model_name_or_path - - hf_weights_files: List[str] = [] - for pattern in allow_patterns: - hf_weights_files += glob.glob(os.path.join(hf_folder, pattern)) - if len(hf_weights_files) > 0: - if pattern == "*.safetensors": - use_safetensors = True - break - - if use_safetensors: - # For models like Mistral-7B-Instruct-v0.3 - # there are both sharded safetensors files and a consolidated - # safetensors file. Using both breaks. - # Here, we download the `model.safetensors.index.json` and filter - # any files not found in the index. 
- if not is_local: - download_safetensors_index_file_from_hf( - model_name_or_path, self.load_config.download_dir, revision - ) - hf_weights_files = filter_duplicate_safetensors_files( - hf_weights_files, hf_folder - ) - else: - hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files) - - if len(hf_weights_files) == 0: - raise RuntimeError( - f"Cannot find any model weights with `{model_name_or_path}`" - ) - - return hf_folder, hf_weights_files, use_safetensors - - def _get_weights_iterator( - self, model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool - ) -> Generator[Tuple[str, torch.Tensor], None, None]: - """Get an iterator for the model weights based on the load format.""" - hf_folder, hf_weights_files, use_safetensors = self._prepare_weights( - model_name_or_path, revision, fall_back_to_pt - ) - if self.load_config.load_format == LoadFormat.NPCACHE: - # Currently np_cache only support *.bin checkpoints - assert use_safetensors is False - weights_iterator = np_cache_weights_iterator( - model_name_or_path, - self.load_config.download_dir, - hf_folder, - hf_weights_files, - ) - elif use_safetensors: - weights_iterator = safetensors_weights_iterator(hf_weights_files) - else: - weights_iterator = pt_weights_iterator(hf_weights_files) - - return weights_iterator - - def load_model( - self, - *, - model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig, - ) -> nn.Module: - with set_default_torch_dtype(model_config.dtype): - with torch.device(device_config.device): - model = _initialize_model( - model_config, - self.load_config, - lora_config, - multimodal_config, - cache_config, - ) - weights = self._get_weights_iterator( - model_config.model, - model_config.revision, - fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True), - ) - - modules = {} - for name, module in model.named_modules(): - modules[name] = module - - def apply_quant_method(module): - quant_method = getattr(module, "quant_method", None) - if quant_method is not None: - # print("before apply quant", module.weight, module.weight.dtype) - quant_method.process_weights_after_loading(module) - # print("after apply quant", module.weight, module.weight.dtype) - # FIXME: Remove this after Mixtral is updated - # to use quant_method. 
- if hasattr(module, "process_weights_after_loading"): - module.process_weights_after_loading() - - if torch.cuda.current_device() == 0: - weights = tqdm( - weights, total=model.get_num_params() * 1.5, desc="load model" - ) - - num_shard = {} - num_loaded = {} - for name, loaded_weight in weights: - model.load_weights(None, name, loaded_weight) - module_name, shard_num = model.get_module_name(name) - num_shard[module_name] = shard_num - if module_name not in num_loaded: - num_loaded[module_name] = 1 - else: - num_loaded[module_name] += 1 - if num_loaded[module_name] == num_shard[module_name]: - apply_quant_method(modules[module_name]) - - return model.eval() - - -def get_model( - *, - model_config: ModelConfig, - load_config: LoadConfig, - device_config: DeviceConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], - cache_config: CacheConfig, -) -> nn.Module: - loader = ModelLoader(load_config) - return loader.load_model( - model_config=model_config, - device_config=device_config, - lora_config=lora_config, - multimodal_config=multimodal_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - cache_config=cache_config, - ) diff --git a/python/sglang/srt/model_loader/utils.py b/python/sglang/srt/model_loader/utils.py deleted file mode 100644 index 9d6520e2ae..0000000000 --- a/python/sglang/srt/model_loader/utils.py +++ /dev/null @@ -1,275 +0,0 @@ -""" -Copyright 2023-2024 SGLang Team -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -# temporarily adapted from vLLM -# FIXME: in progress of refactoring the model loader -"""Utilities for selecting and loading models.""" -import contextlib -import fnmatch -import hashlib -import json -import logging -import os -import tempfile -from typing import Any, Generator, Iterable, List, Optional, Tuple, Type - -import filelock -import huggingface_hub.constants -import torch -from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download -from safetensors.torch import load_file, safe_open, save_file -from torch import nn -from tqdm.auto import tqdm -from transformers.utils import SAFE_WEIGHTS_INDEX_NAME -from vllm.config import LoadConfig, ModelConfig -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig - -from sglang.srt.layers.quantization import get_quantization_config - -logger = logging.getLogger(__name__) -temp_dir = tempfile.gettempdir() - - -@contextlib.contextmanager -def set_default_torch_dtype(dtype: torch.dtype): - """Sets the default torch dtype to the given dtype.""" - old_dtype = torch.get_default_dtype() - torch.set_default_dtype(dtype) - yield - torch.set_default_dtype(old_dtype) - - -def get_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: - architectures = getattr(model_config.hf_config, "architectures", []) - # Special handling for quantized Mixtral. - # FIXME(woosuk): This is a temporary hack. 
- if ( - model_config.quantization is not None - and model_config.quantization != "fp8" - and "MixtralForCausalLM" in architectures - ): - architectures = ["QuantMixtralForCausalLM"] - - for arch in architectures: - model_cls = ModelRegistry.load_model_cls(arch) - if model_cls is not None: - return (model_cls, arch) - raise ValueError( - f"Model architectures {architectures} are not supported for now. " - f"Supported architectures: {ModelRegistry.get_supported_archs()}" - ) - - -class DisabledTqdm(tqdm): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs, disable=True) - - -def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None): - lock_dir = cache_dir or temp_dir - os.makedirs(os.path.dirname(lock_dir), exist_ok=True) - model_name = model_name_or_path.replace("/", "-") - hash_name = hashlib.sha256(model_name.encode()).hexdigest() - # add hash to avoid conflict with old users' lock files - lock_file_name = hash_name + model_name + ".lock" - # mode 0o666 is required for the filelock to be shared across users - lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name), mode=0o666) - return lock - - -def download_weights_from_hf( - model_name_or_path: str, - cache_dir: Optional[str], - allow_patterns: List[str], - revision: Optional[str] = None, -) -> str: - """Download model weights from Hugging Face Hub. - - Args: - model_name_or_path (str): The model name or path. - cache_dir (Optional[str]): The cache directory to store the model - weights. If None, will use HF defaults. - allow_patterns (List[str]): The allowed patterns for the - weight files. Files matched by any of the patterns will be - downloaded. - revision (Optional[str]): The revision of the model. - - Returns: - str: The path to the downloaded model weights. - """ - if not huggingface_hub.constants.HF_HUB_OFFLINE: - # Before we download we look at that is available: - fs = HfFileSystem() - file_list = fs.ls(model_name_or_path, detail=False, revision=revision) - - # depending on what is available we download different things - for pattern in allow_patterns: - matching = fnmatch.filter(file_list, pattern) - if len(matching) > 0: - allow_patterns = [pattern] - break - - logger.info("Using model weights format %s", allow_patterns) - # Use file lock to prevent multiple processes from - # downloading the same model weights at the same time. - with get_lock(model_name_or_path, cache_dir): - hf_folder = snapshot_download( - model_name_or_path, - allow_patterns=allow_patterns, - cache_dir=cache_dir, - tqdm_class=DisabledTqdm, - revision=revision, - local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, - ) - return hf_folder - - -def download_safetensors_index_file_from_hf( - model_name_or_path: str, - cache_dir: Optional[str], - revision: Optional[str] = None, -) -> None: - """Download hf safetensors index file from Hugging Face Hub. - - Args: - model_name_or_path (str): The model name or path. - cache_dir (Optional[str]): The cache directory to store the model - weights. If None, will use HF defaults. - revision (Optional[str]): The revision of the model. - """ - # Use file lock to prevent multiple processes from - # downloading the same model weights at the same time. - with get_lock(model_name_or_path, cache_dir): - try: - # Download the safetensors index file. 
- hf_hub_download( - repo_id=model_name_or_path, - filename=SAFE_WEIGHTS_INDEX_NAME, - cache_dir=cache_dir, - revision=revision, - local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, - ) - # If file not found on remote or locally, we should not fail since - # only some models will have SAFE_WEIGHTS_INDEX_NAME. - except huggingface_hub.utils.EntryNotFoundError: - logger.info("No %s found in remote.", SAFE_WEIGHTS_INDEX_NAME) - except huggingface_hub.utils.LocalEntryNotFoundError: - logger.info("No %s found in local cache.", SAFE_WEIGHTS_INDEX_NAME) - - -# For models like Mistral-7B-v0.3, there are both sharded -# safetensors files and a consolidated safetensors file. -# Passing both of these to the weight loader functionality breaks. -# So, we use the SAFE_WEIGHTS_INDEX_NAME to -# look up which safetensors files should be used. -def filter_duplicate_safetensors_files( - hf_weights_files: List[str], hf_folder: str -) -> List[str]: - # model.safetensors.index.json is a mapping from keys in the - # torch state_dict to safetensors file holding that weight. - index_file_name = os.path.join(hf_folder, SAFE_WEIGHTS_INDEX_NAME) - if not os.path.isfile(index_file_name): - return hf_weights_files - - # Iterate through the weight_map (weight_name: safetensors files) - # to identify weights that we should use. - with open(index_file_name) as index_file: - weight_map = json.load(index_file)["weight_map"] - weight_files_in_index = set() - for weight_name in weight_map: - weight_files_in_index.add(os.path.join(hf_folder, weight_map[weight_name])) - # Filter out any fields that are not found in the index file. - hf_weights_files = [f for f in hf_weights_files if f in weight_files_in_index] - return hf_weights_files - - -def safetensors_weights_iterator( - hf_weights_files: List[str], -) -> Generator[Tuple[str, torch.Tensor], None, None]: - """Iterate over the weights in the model safetensor files.""" - for st_file in hf_weights_files: - with safe_open(st_file, framework="pt") as f: - for name in f.keys(): # noqa: SIM118 - param = f.get_tensor(name) - yield name, param - - -def get_quant_config( - model_config: ModelConfig, load_config: LoadConfig -) -> QuantizationConfig: - quant_cls = get_quantization_config(model_config.quantization) - # Read the quantization config from the HF model config, if available. - hf_quant_config = getattr(model_config.hf_config, "quantization_config", None) - if hf_quant_config is None: - # compressed-tensors uses a compressions_config - hf_quant_config = getattr(model_config.hf_config, "compression_config", None) - if hf_quant_config is not None: - return quant_cls.from_config(hf_quant_config) - # In case of bitsandbytes/QLoRA, get quant config from the adapter model. - if model_config.quantization == "bitsandbytes": - if ( - not load_config.model_loader_extra_config - or "qlora_adapter_name_or_path" not in load_config.model_loader_extra_config - ): - return quant_cls.from_config({"adapter_name_or_path": ""}) - model_name_or_path = load_config.model_loader_extra_config[ - "qlora_adapter_name_or_path" - ] - - else: - model_name_or_path = model_config.model - is_local = os.path.isdir(model_name_or_path) - if not is_local: - # Download the config files. 
- with get_lock(model_name_or_path, load_config.download_dir): - hf_folder = snapshot_download( - model_name_or_path, - revision=model_config.revision, - allow_patterns="*.json", - cache_dir=load_config.download_dir, - local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, - tqdm_class=DisabledTqdm, - ) - else: - hf_folder = model_name_or_path - - possible_config_filenames = quant_cls.get_config_filenames() - - # If the quantization config is not found, use the default config. - if not possible_config_filenames: - return quant_cls() - - config_files = glob.glob(os.path.join(hf_folder, "*.json")) - - quant_config_files = [ - f for f in config_files if any(f.endswith(x) for x in possible_config_filenames) - ] - if len(quant_config_files) == 0: - raise ValueError(f"Cannot find the config file for {model_config.quantization}") - if len(quant_config_files) > 1: - raise ValueError( - f"Found multiple config files for {model_config.quantization}: " - f"{quant_config_files}" - ) - - quant_config_file = quant_config_files[0] - with open(quant_config_file, "r") as f: - config = json.load(f) - - if model_config.quantization == "bitsandbytes": - config["adapter_name_or_path"] = model_name_or_path - - return quant_cls.from_config(config) From 93d4e354d82b95663f52f3d031f8e432ad0c1803 Mon Sep 17 00:00:00 2001 From: Ying Sheng Date: Thu, 15 Aug 2024 10:33:20 -0700 Subject: [PATCH 036/118] [Fix] Window attention compatible with RadixAttention and chunked prefill (#1112) --- python/sglang/srt/layers/radix_attention.py | 38 +++++++++---------- .../srt/model_executor/forward_batch_info.py | 37 ++++++++---------- .../sglang/srt/model_executor/model_runner.py | 6 +-- python/sglang/srt/models/gemma2.py | 2 +- python/sglang/srt/server_args.py | 10 +---- 5 files changed, 37 insertions(+), 56 deletions(-) diff --git a/python/sglang/srt/layers/radix_attention.py b/python/sglang/srt/layers/radix_attention.py index a7474326fc..a02673dc37 100644 --- a/python/sglang/srt/layers/radix_attention.py +++ b/python/sglang/srt/layers/radix_attention.py @@ -15,6 +15,8 @@ """Radix attention.""" +from typing import Optional + import torch from flashinfer.cascade import merge_state from torch import nn @@ -34,8 +36,7 @@ def __init__( scaling: float, num_kv_heads: int, layer_id: int, - reuse: bool = False, - sliding_window_size: int = -1, + sliding_window_size: Optional[int] = None, logit_cap: int = -1, v_head_dim: int = -1, ): @@ -48,8 +49,7 @@ def __init__( self.v_head_dim = v_head_dim if v_head_dim != -1 else head_dim self.scaling = scaling self.layer_id = layer_id - self.reuse = reuse - self.sliding_window_size = sliding_window_size + self.sliding_window_size = sliding_window_size if sliding_window_size else -1 if ( not global_server_args_dict.get("disable_flashinfer", False) @@ -118,16 +118,16 @@ def decode_forward_triton(self, q, k, v, input_metadata: InputMetadata): def extend_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata): # using two wrappers is unnecessary in the current PR, but are prepared for future PRs - prefill_wrapper_ragged = input_metadata.flashinfer_prefill_wrapper_ragged prefill_wrapper_paged = input_metadata.flashinfer_prefill_wrapper_paged - if self.sliding_window_size != -1 or self.reuse: + if self.sliding_window_size != -1: prefill_wrapper_paged = prefill_wrapper_paged[0] else: if isinstance(prefill_wrapper_paged, list): prefill_wrapper_paged = prefill_wrapper_paged[1] - if not input_metadata.flashinfer_use_ragged or self.reuse: - if not self.reuse: + if not 
input_metadata.flashinfer_use_ragged: + if k is not None: + assert v is not None self.store_kv_cache(k, v, input_metadata) o = prefill_wrapper_paged.forward( @@ -139,21 +139,20 @@ def extend_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata): logits_soft_cap=self.logit_cap, ) else: - o1, s1 = prefill_wrapper_ragged.forward_return_lse( - q.contiguous().view(-1, self.tp_q_head_num, self.head_dim), - k.contiguous().view(-1, self.tp_k_head_num, self.head_dim), - v.contiguous().view(-1, self.tp_v_head_num, self.head_dim), - causal=True, - sm_scale=self.scaling, - window_left=self.sliding_window_size, - logits_soft_cap=self.logit_cap, + o1, s1 = ( + input_metadata.flashinfer_prefill_wrapper_ragged.forward_return_lse( + q.contiguous().view(-1, self.tp_q_head_num, self.head_dim), + k.contiguous().view(-1, self.tp_k_head_num, self.head_dim), + v.contiguous().view(-1, self.tp_v_head_num, self.head_dim), + causal=True, + sm_scale=self.scaling, + logits_soft_cap=self.logit_cap, + ) ) if input_metadata.extend_no_prefix: o = o1 else: - # TODO window attention + radix attention will come up in next PR - assert self.sliding_window_size == -1 o2, s2 = prefill_wrapper_paged.forward_return_lse( q.contiguous().view(-1, self.tp_q_head_num, self.head_dim), input_metadata.token_to_kv_pool.get_kv_buffer(self.layer_id), @@ -179,7 +178,8 @@ def decode_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata): if isinstance(decode_wrapper, list): decode_wrapper = decode_wrapper[1] - if not self.reuse: + if k is not None: + assert v is not None self.store_kv_cache(k, v, input_metadata) o = decode_wrapper.forward( diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py index 66479b255b..ce5ea25eaa 100644 --- a/python/sglang/srt/model_executor/forward_batch_info.py +++ b/python/sglang/srt/model_executor/forward_batch_info.py @@ -194,6 +194,7 @@ def from_schedule_batch( if ( forward_mode != ForwardMode.DECODE and int(torch.sum(ret.seq_lens)) > 4096 + and model_runner.sliding_window_size is None ): flashinfer_use_ragged = True ret.init_flashinfer_handlers( @@ -322,22 +323,25 @@ def update_flashinfer_indices( 1, ) else: + # window attention use paged only kv_last_page_len = torch.ones((batch_size,), dtype=torch.int32, device="cuda") for wrapper_id in range(2): - if flashinfer_use_ragged and wrapper_id == 1: - # full attention use ragged+paged - paged_kernel_lens = prefix_lens + if wrapper_id == 0: + if forward_mode == ForwardMode.DECODE: + paged_kernel_lens = torch.minimum( + seq_lens, torch.tensor(model_runner.sliding_window_size + 1) + ) + else: + paged_kernel_lens = torch.minimum( + seq_lens, + torch.tensor(model_runner.sliding_window_size) + + seq_lens + - prefix_lens, + ) else: - # window attention use paged only paged_kernel_lens = seq_lens - if wrapper_id == 0 and forward_mode == ForwardMode.DECODE: - paged_kernel_lens = torch.minimum( - paged_kernel_lens, torch.tensor(model_runner.sliding_window_size) - ) - kv_start_idx = seq_lens - paged_kernel_lens - else: - kv_start_idx = torch.zeros(batch_size, dtype=torch.int32, device="cuda") + kv_start_idx = seq_lens - paged_kernel_lens kv_indptr = torch.zeros((batch_size + 1,), dtype=torch.int32, device="cuda") kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0) @@ -376,17 +380,6 @@ def update_flashinfer_indices( ) qo_indptr[1:] = torch.cumsum(seq_lens - prefix_lens, dim=0) - if flashinfer_use_ragged and wrapper_id == 1: - 
model_runner.flashinfer_prefill_wrapper_ragged.end_forward() - model_runner.flashinfer_prefill_wrapper_ragged.begin_forward( - qo_indptr, - qo_indptr, - num_qo_heads, - num_kv_heads, - head_dim, - ) - - # cached part model_runner.flashinfer_prefill_wrapper_paged[wrapper_id].end_forward() model_runner.flashinfer_prefill_wrapper_paged[wrapper_id].begin_forward( qo_indptr, diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index d3ed96fe02..7af4ec2ddd 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -334,11 +334,7 @@ def init_flashinfer(self): dtype=torch.uint8, device="cuda", ) - self.flashinfer_prefill_wrapper_ragged = ( - BatchPrefillWithRaggedKVCacheWrapper( - self.flashinfer_workspace_buffer, "NHD" - ) - ) + self.flashinfer_prefill_wrapper_ragged = None self.flashinfer_prefill_wrapper_paged = [] self.flashinfer_decode_wrapper = [] for i in range(2): diff --git a/python/sglang/srt/models/gemma2.py b/python/sglang/srt/models/gemma2.py index 463d5e5054..80b99742e3 100644 --- a/python/sglang/srt/models/gemma2.py +++ b/python/sglang/srt/models/gemma2.py @@ -213,7 +213,7 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, layer_id=layer_idx, - sliding_window_size=get_window_size(config) if use_sliding_window else -1, + sliding_window_size=get_window_size(config) if use_sliding_window else None, logit_cap=self.config.attn_logit_softcapping, ) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 6512e1b6ec..738ab7d1ab 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -450,16 +450,8 @@ def check_server_args(self): self.dp_size > 1 and self.node_rank is not None ), "multi-node data parallel is not supported" if "gemma-2" in self.model_path.lower(): - logger.info( - f"When using sliding window in gemma-2, disable radix_cache, regex_jump_forward, and turn on flashinfer." 
- ) - # FIXME: compatibility with radix attention - self.disable_radix_cache = True - # FIXME: compatibility with jump forward - self.disable_regex_jump_forward = True + logger.info(f"When using sliding window in gemma-2, turn on flashinfer.") self.disable_flashinfer = False - # FIXME: compatibility with chunked prefill - self.chunked_prefill_size = None @dataclasses.dataclass From 0cb099e20a0b9ccd308fff5ef133a2e4b26a7f7a Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 15 Aug 2024 10:47:39 -0700 Subject: [PATCH 037/118] set CUDA_DEVICE_MAX_CONNECTIONS=1 (#1113) --- python/sglang/srt/server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index ae886796c5..4f06f76300 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -360,6 +360,7 @@ def _set_envs_and_config(server_args: ServerArgs): os.environ["NCCL_CUMEM_ENABLE"] = "0" os.environ["NCCL_NVLS_ENABLE"] = "0" os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" + os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" # Set ulimit set_ulimit() From 5bd953749b520070a5b72b5b99b9a92853698685 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Fri, 16 Aug 2024 01:50:43 +0800 Subject: [PATCH 038/118] chore: bump v0.2.13 (#1111) --- README.md | 2 +- python/pyproject.toml | 2 +- python/sglang/version.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 451e0a6934..5434bb25f6 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ ### Method 2: From source ``` # Use the last release branch -git clone -b v0.2.12 https://github.com/sgl-project/sglang.git +git clone -b v0.2.13 https://github.com/sgl-project/sglang.git cd sglang pip install --upgrade pip diff --git a/python/pyproject.toml b/python/pyproject.toml index 22a906e8aa..7ba4b4c6bd 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "sglang" -version = "0.2.12" +version = "0.2.13" description = "SGLang is yet another fast serving framework for large language models and vision language models." readme = "README.md" requires-python = ">=3.8" diff --git a/python/sglang/version.py b/python/sglang/version.py index b5c9b6cb71..11ef092868 100644 --- a/python/sglang/version.py +++ b/python/sglang/version.py @@ -1 +1 @@ -__version__ = "0.2.12" +__version__ = "0.2.13" From 87a0db82b867d52e775b96e344b0e16ff60cdb67 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 15 Aug 2024 10:54:24 -0700 Subject: [PATCH 039/118] update hyperparameter guide (#1114) --- docs/en/hyperparameter_tuning.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/en/hyperparameter_tuning.md b/docs/en/hyperparameter_tuning.md index 53b92435c7..02a0657c3f 100644 --- a/docs/en/hyperparameter_tuning.md +++ b/docs/en/hyperparameter_tuning.md @@ -10,7 +10,8 @@ When the server is running at full load, look for the following in the log: ### Tune Your Request Submission Speed `#queue-req` indicates the number of requests in the queue. If you frequently see `#queue-req == 0`, it suggests you are bottlenecked by the request submission speed. -A healthy range for `#queue-req` is `100 - 1000`. +A healthy range for `#queue-req` is `50 - 1000`. +On the other hand, do not make `#queue-req` too large because it will also increase the scheduling overhead on the server. 
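
If the submission speed is the bottleneck, the usual fix is to issue requests asynchronously with enough client-side concurrency instead of sending them one at a time. The sketch below is illustrative only: it assumes a locally launched server listening on port 30000 and uses the `/generate` endpoint with the same payload shape as the benchmark scripts; the URL, concurrency target, prompts, and sampling parameters are placeholders to adapt to your setup.

```python
# Minimal client-side pacing sketch (assumptions: local server on port 30000,
# /generate endpoint, aiohttp installed). Keep enough requests in flight so
# the server log shows #queue-req staying above zero.
import asyncio

import aiohttp

URL = "http://127.0.0.1:30000/generate"  # assumed server address
CONCURRENCY = 256  # tune this until #queue-req no longer drops to 0


async def send_one(session, sem, prompt):
    async with sem:
        payload = {
            "text": prompt,
            "sampling_params": {"temperature": 0, "max_new_tokens": 64},
        }
        async with session.post(URL, json=payload) as resp:
            return await resp.json()


async def main(prompts):
    sem = asyncio.Semaphore(CONCURRENCY)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(send_one(session, sem, p) for p in prompts))


if __name__ == "__main__":
    outputs = asyncio.run(main([f"What is 1 + {i}?" for i in range(2000)]))
    print(f"received {len(outputs)} responses")
```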
### Tune `--schedule-conservativeness` `token usage` indicates the KV cache memory utilization of the server. `token usage > 0.9` means good utilization. @@ -19,13 +20,14 @@ The case of serving being too conservative can happen when users send many reque On the other hand, if you see `token usage` very high and you frequently see warnings like `decode out of memory happened, #retracted_reqs: 1, #new_token_ratio: 0.9998 -> 1.0000`, you can increase `--schedule-conservativeness` to a value like 1.3. +If you see `decode out of memory happened` occasionally but not frequently, it is okay. ### Tune `--dp-size` and `--tp-size` Data parallelism is better for throughput. When there is enough GPU memory, always favor data parallelism for throughput. -### (Minor) Tune `--max-prefill-tokens`, `--mem-fraction-static`, `--max-running-requests` +### Avoid out-of-memory by tuning `--chunked-prefill-size`, `--mem-fraction-static`, `--max-running-requests` If you see out of memory (OOM) errors, you can decrease these parameters. -If OOM happens during prefill, try to decrease `--max-prefill-tokens`. +If OOM happens during prefill, try to decrease `--chunked-prefill-size` to `4096` or `2048`. If OOM happens during decoding, try to decrease `--max-running-requests`. You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding. From 26e9c12c159277684078d70724247b16611d9e08 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Fri, 16 Aug 2024 02:26:44 +0800 Subject: [PATCH 040/118] ci: compatible with fork repo (#1115) --- .github/workflows/cancel-pr-workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cancel-pr-workflow.yml b/.github/workflows/cancel-pr-workflow.yml index d4709dc463..535884ba60 100644 --- a/.github/workflows/cancel-pr-workflow.yml +++ b/.github/workflows/cancel-pr-workflow.yml @@ -1,7 +1,7 @@ name: Cancel PR Workflows on Merge on: - pull_request: + pull_request_target: types: - closed From 6aa8ad14f8a9b09904c11413449b9b5d942a115a Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Fri, 16 Aug 2024 13:46:43 +0800 Subject: [PATCH 041/118] fix: resolve Python.h header missing (#1119) --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 2f7541c9a4..42656ca264 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -8,7 +8,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && apt update -y \ && apt install software-properties-common -y \ && add-apt-repository ppa:deadsnakes/ppa -y && apt update \ - && apt install python3.10 -y \ + && apt install python3.10 python3.10-dev -y \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 2 \ && update-alternatives --set python3 /usr/bin/python3.10 && apt install python3.10-distutils -y \ && apt install curl git sudo -y \ From 5a261bd0552c049f7eb14dfd20a1ae43f61c9f46 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Fri, 16 Aug 2024 01:39:24 -0700 Subject: [PATCH 042/118] Fix the deadlock in multi-node tp (#1122) --- benchmark/gsm8k/bench_sglang.py | 4 +++- python/sglang/srt/layers/logits_processor.py | 10 ++++++---- python/sglang/srt/managers/schedule_batch.py | 14 +++++++++++++- python/sglang/srt/managers/tp_worker.py | 16 ++++++++++------ .../srt/model_executor/cuda_graph_runner.py | 15 ++++++++++++++- 
python/sglang/srt/model_executor/model_runner.py | 6 +++++- python/sglang/srt/models/grok.py | 5 +++-- 7 files changed, 54 insertions(+), 16 deletions(-) diff --git a/benchmark/gsm8k/bench_sglang.py b/benchmark/gsm8k/bench_sglang.py index d9d4b0ab20..d32790fe0c 100644 --- a/benchmark/gsm8k/bench_sglang.py +++ b/benchmark/gsm8k/bench_sglang.py @@ -64,7 +64,9 @@ def main(args): @sgl.function def few_shot_gsm8k(s, question): s += few_shot_examples + question - s += sgl.gen("answer", max_tokens=512, stop=["Question", "Assistant:"]) + s += sgl.gen( + "answer", max_tokens=512, stop=["Question", "Assistant:", "<|separator|>"] + ) ##################################### ########## SGL Program End ########## diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py index 541fa0f153..2e0ce6d5c7 100644 --- a/python/sglang/srt/layers/logits_processor.py +++ b/python/sglang/srt/layers/logits_processor.py @@ -67,10 +67,12 @@ def from_input_metadata(cls, input_metadata: InputMetadata): class LogitsProcessor(nn.Module): - def __init__(self, config): + def __init__(self, config, skip_all_gather: bool = False): super().__init__() self.config = config - self.tp_size = get_tensor_model_parallel_world_size() + self.do_tensor_parallel_all_gather = ( + not skip_all_gather and get_tensor_model_parallel_world_size() > 1 + ) def _get_normalized_prompt_logprobs( self, input_token_logprobs, logits_metadata: LogitsMetadata @@ -159,7 +161,7 @@ def forward( last_hidden = hidden_states[last_index] last_logits = torch.matmul(last_hidden, weight.T) - if self.tp_size > 1: + if self.do_tensor_parallel_all_gather: last_logits = tensor_model_parallel_all_gather(last_logits) last_logits = last_logits[:, : self.config.vocab_size].float() @@ -204,7 +206,7 @@ def forward( ) else: all_logits = torch.matmul(hidden_states, weight.T) - if self.tp_size > 1: + if self.do_tensor_parallel_all_gather: all_logits = tensor_model_parallel_all_gather(all_logits) all_logits = all_logits[:, : self.config.vocab_size].float() diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 9e86c9b188..f6706781db 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -21,7 +21,9 @@ from typing import List, Optional, Union import torch +import torch.distributed as dist from flashinfer.sampling import top_k_top_p_sampling_from_probs +from vllm.distributed import get_tensor_model_parallel_group import sglang.srt.sampling.penaltylib as penaltylib from sglang.global_config import global_config @@ -724,7 +726,7 @@ def merge(self, other: "ScheduleBatch"): ) self.logit_bias = torch.concat([self.logit_bias, other.logit_bias]) - def sample(self, logits: torch.Tensor): + def sample(self, logits: torch.Tensor, is_multi_node_tp=False): # TODO(lsyin): move this into a part of layer and run with CUDA Graph # Post process logits logits = logits.contiguous() @@ -779,6 +781,16 @@ def sample(self, logits: torch.Tensor): self.penalizer_orchestrator.cumulate_output_tokens(batch_next_token_ids) + if is_multi_node_tp: + # If the tensor parallelism spans across multiple nodes, there is some indeterminism + # that can cause the TP workers to generate different tokens, so we need to + # sync here + torch.distributed.all_reduce( + batch_next_token_ids, + op=dist.ReduceOp.MIN, + group=get_tensor_model_parallel_group().device_group, + ) + return batch_next_token_ids diff --git a/python/sglang/srt/managers/tp_worker.py 
b/python/sglang/srt/managers/tp_worker.py index 4d869c5919..945a4c95e1 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -85,10 +85,6 @@ def __init__( self.schedule_policy = server_args.schedule_policy self.disable_regex_jump_forward = server_args.disable_regex_jump_forward - # Chunked prefill - self.chunked_prefill_size = server_args.chunked_prefill_size - self.current_inflight_req = None - # Init model and tokenizer self.model_config = ModelConfig( server_args.model_path, @@ -175,6 +171,10 @@ def __init__( self.num_generated_tokens = 0 self.last_stats_tic = time.time() + # Chunked prefill + self.chunked_prefill_size = server_args.chunked_prefill_size + self.current_inflight_req = None + # Init the FSM cache for constrained generation if not server_args.skip_tokenizer_init: self.regex_fsm_cache = FSMCache( @@ -444,7 +444,9 @@ def forward_prefill_batch(self, batch: ScheduleBatch): # Forward and sample the next tokens if batch.extend_num_tokens != 0: output = self.model_runner.forward(batch, ForwardMode.EXTEND) - next_token_ids = batch.sample(output.next_token_logits) + next_token_ids = batch.sample( + output.next_token_logits, self.model_runner.is_multi_node_tp + ) # Move logprobs to cpu if output.next_token_logprobs is not None: @@ -603,7 +605,9 @@ def forward_decode_batch(self, batch: ScheduleBatch): # Forward and sample the next tokens output = self.model_runner.forward(batch, ForwardMode.DECODE) - next_token_ids = batch.sample(output.next_token_logits) + next_token_ids = batch.sample( + output.next_token_logits, self.model_runner.is_multi_node_tp + ) # Move logprobs to cpu if output.next_token_logprobs is not None: diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index 3d4e5d4c6a..af39065cfa 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -142,7 +142,7 @@ def __init__(self, model_runner, max_batch_size_to_capture, use_torch_compile): set_torch_compile_config() def can_run(self, batch_size): - return batch_size < self.max_bs + return batch_size <= self.max_bs def capture(self, batch_size_list): self.batch_size_list = batch_size_list @@ -239,12 +239,23 @@ def run_once(): return forward(input_ids, input_metadata.positions, input_metadata) for _ in range(2): + torch.cuda.synchronize() + self.model_runner.tp_group.barrier() + run_once() + torch.cuda.synchronize() + self.model_runner.tp_group.barrier() + torch.cuda.synchronize() + self.model_runner.tp_group.barrier() + with torch.cuda.graph(graph, pool=self.graph_memory_pool, stream=stream): out = run_once() + torch.cuda.synchronize() + self.model_runner.tp_group.barrier() + self.graph_memory_pool = graph.pool() return graph, None, out, flashinfer_decode_wrapper @@ -278,7 +289,9 @@ def replay(self, batch: ScheduleBatch): ) # Replay + torch.cuda.synchronize() self.graphs[bs].replay() + torch.cuda.synchronize() output = self.output_buffers[bs] # Unpad diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 7af4ec2ddd..2de432144e 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -38,6 +38,7 @@ init_distributed_environment, initialize_model_parallel, ) +from vllm.distributed.parallel_state import in_the_same_node_as from vllm.model_executor.model_loader import get_model from vllm.model_executor.models import 
ModelRegistry @@ -112,10 +113,13 @@ def __init__( distributed_init_method=nccl_init_method, ) initialize_model_parallel(tensor_model_parallel_size=self.tp_size) - self.tp_group = get_tp_group() total_gpu_memory = get_available_gpu_memory( self.gpu_id, distributed=self.tp_size > 1 ) + self.tp_group = get_tp_group() + self.is_multi_node_tp = not all( + in_the_same_node_as(self.tp_group.cpu_group, source_rank=0) + ) if self.tp_size > 1: total_local_gpu_memory = get_available_gpu_memory(self.gpu_id) diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py index eff746f1dd..75b086fd6a 100644 --- a/python/sglang/srt/models/grok.py +++ b/python/sglang/srt/models/grok.py @@ -295,8 +295,9 @@ def __init__( self.config = config self.quant_config = quant_config self.model = Grok1Model(config, quant_config=quant_config) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.logits_processor = LogitsProcessor(config) + # self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.lm_head = ReplicatedLinear(config.hidden_size, config.vocab_size) + self.logits_processor = LogitsProcessor(config, skip_all_gather=True) # Monkey patch _prepare_weights to load pre-sharded weights setattr(DefaultModelLoader, "_prepare_weights", _prepare_presharded_weights) From 3694f8f996e25c862cd67057e2bfa5844900fc98 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Fri, 16 Aug 2024 02:13:00 -0700 Subject: [PATCH 043/118] Mixed style of chunked prefill (#1013) --- .../sglang/srt/managers/policy_scheduler.py | 7 +- python/sglang/srt/managers/schedule_batch.py | 27 +++++++ python/sglang/srt/managers/tp_worker.py | 46 +++++++++--- .../srt/model_executor/forward_batch_info.py | 38 +++++----- python/sglang/srt/server.py | 9 --- python/sglang/srt/server_args.py | 6 ++ python/sglang/test/simple_eval_common.py | 19 +++-- python/sglang/test/simple_eval_gpqa.py | 3 +- python/sglang/test/simple_eval_humaneval.py | 4 +- python/sglang/test/simple_eval_math.py | 3 +- python/sglang/test/simple_eval_mmlu.py | 3 +- test/srt/test_chunked_prefill.py | 15 +++- ...est_eval_accuracy_large_chunked_prefill.py | 1 - ...al_accuracy_large_mixed_chunked_prefill.py | 73 +++++++++++++++++++ 14 files changed, 195 insertions(+), 59 deletions(-) create mode 100644 test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py diff --git a/python/sglang/srt/managers/policy_scheduler.py b/python/sglang/srt/managers/policy_scheduler.py index 9d5f991975..04169e8086 100644 --- a/python/sglang/srt/managers/policy_scheduler.py +++ b/python/sglang/srt/managers/policy_scheduler.py @@ -111,11 +111,14 @@ def __init__( rem_total_tokens: int, rem_input_tokens: int, rem_chunk_tokens: Optional[int], + mixed_with_decode_tokens: int = 0, ): self.tree_cache = tree_cache - self.rem_total_tokens = rem_total_tokens - self.rem_input_tokens = rem_input_tokens + self.rem_total_tokens = rem_total_tokens - mixed_with_decode_tokens + self.rem_input_tokens = rem_input_tokens - mixed_with_decode_tokens self.rem_chunk_tokens = rem_chunk_tokens + if self.rem_chunk_tokens is not None: + self.rem_chunk_tokens -= mixed_with_decode_tokens self.can_run_list = [] self.new_inflight_req = None diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index f6706781db..42c291bb17 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -329,6 +329,9 @@ class ScheduleBatch: out_cache_loc: torch.Tensor = None extend_num_tokens: int = None 
+ # For mixed chunekd prefill + prefix_lens_cpu: List[int] = None + # For processing logprobs return_logprob: bool = False top_logprobs_nums: List[int] = None @@ -462,9 +465,33 @@ def prepare_for_extend(self, vocab_size: int): self.extend_num_tokens = extend_num_tokens self.out_cache_loc = out_cache_loc self.top_logprobs_nums = [r.top_logprobs_num for r in reqs] + self.prefix_lens_cpu = [len(r.prefix_indices) for r in reqs] self.batch_sampling_params(vocab_size) + def mix_with_running(self, running_batch: "ScheduleBatch"): + # NOTE: prefix_indices is what has been cached, but we don't cache each decode step + prefix_lens_cpu = [len(r.prefix_indices) for r in self.reqs] + prefix_lens_cpu.extend( + [ + len(r.origin_input_ids) + len(r.output_ids) - 1 + for r in running_batch.reqs + ] + ) + + for req in running_batch.reqs: + req.fill_ids = req.origin_input_ids + req.output_ids + req.extend_input_len = 1 + + input_ids = torch.cat([self.input_ids, running_batch.input_ids]) + out_cache_loc = torch.cat([self.out_cache_loc, running_batch.out_cache_loc]) + extend_num_tokens = self.extend_num_tokens + running_batch.batch_size() + self.merge(running_batch) + self.input_ids = input_ids + self.out_cache_loc = out_cache_loc + self.extend_num_tokens = extend_num_tokens + self.prefix_lens_cpu = prefix_lens_cpu + def check_decode_mem(self): bs = self.batch_size() if self.token_to_kv_pool.available_size() >= bs: diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index 945a4c95e1..b6cfa68bd4 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -174,6 +174,9 @@ def __init__( # Chunked prefill self.chunked_prefill_size = server_args.chunked_prefill_size self.current_inflight_req = None + self.is_mixed_chunk = ( + self.chunked_prefill_size is not None and server_args.enable_mixed_chunk + ) # Init the FSM cache for constrained generation if not server_args.skip_tokenizer_init: @@ -366,11 +369,14 @@ def get_new_prefill_batch(self) -> Optional[ScheduleBatch]: # Get priority queue prefix_computed = self.scheduler.calc_priority(self.waiting_queue) + num_mixed_running = running_bs if self.is_mixed_chunk else 0 + adder = PrefillAdder( self.tree_cache, self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size(), self.max_prefill_tokens, self.chunked_prefill_size, + num_mixed_running, ) if self.running_batch is not None: @@ -416,15 +422,27 @@ def get_new_prefill_batch(self) -> Optional[ScheduleBatch]: ) else: tree_cache_hit_rate = 0.0 - logger.info( - f"[gpu={self.gpu_id}] Prefill batch. " - f"#new-seq: {len(can_run_list)}, " - f"#new-token: {adder.log_input_tokens}, " - f"#cached-token: {adder.log_hit_tokens}, " - f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, " - f"#running-req: {running_bs}, " - f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + has_inflight}" - ) + + if num_mixed_running > 0: + logger.info( + f"[gpu={self.gpu_id}] Prefill batch" + f"(mixed #running-req: {num_mixed_running}). " + f"#new-seq: {len(can_run_list)}, " + f"#new-token: {adder.log_input_tokens}, " + f"#cached-token: {adder.log_hit_tokens}, " + f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, " + f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + has_inflight}" + ) + else: + logger.info( + f"[gpu={self.gpu_id}] Prefill batch. 
" + f"#new-seq: {len(can_run_list)}, " + f"#new-token: {adder.log_input_tokens}, " + f"#cached-token: {adder.log_hit_tokens}, " + f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, " + f"#running-req: {running_bs}, " + f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + has_inflight}" + ) # Return the new batch new_batch = ScheduleBatch.init_new( @@ -440,6 +458,13 @@ def forward_prefill_batch(self, batch: ScheduleBatch): # Build batch tensors batch.prepare_for_extend(self.model_config.vocab_size) + decoding_reqs = [] + if self.is_mixed_chunk and self.running_batch is not None: + self.running_batch.prepare_for_decode() + batch.mix_with_running(self.running_batch) + decoding_reqs = self.running_batch.reqs + self.running_batch = None + if self.model_runner.is_generation: # Forward and sample the next tokens if batch.extend_num_tokens != 0: @@ -481,7 +506,8 @@ def forward_prefill_batch(self, batch: ScheduleBatch): if req.finished(): self.tree_cache.cache_finished_req(req) - else: + elif req not in decoding_reqs: + # To reduce overhead, only cache prefill reqs self.tree_cache.cache_unfinished_req(req) if req is self.current_inflight_req: diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py index ce5ea25eaa..3cf68eab24 100644 --- a/python/sglang/srt/model_executor/forward_batch_info.py +++ b/python/sglang/srt/model_executor/forward_batch_info.py @@ -88,11 +88,11 @@ def init_multimuldal_info(self, batch: ScheduleBatch): self.image_sizes = [r.image_size for r in reqs] self.image_offsets = [ ( - (r.image_offset - len(r.prefix_indices)) + (r.image_offset - batch.prefix_lens_cpu[i]) if r.image_offset is not None else 0 ) - for r in reqs + for i, r in enumerate(reqs) ] def compute_positions(self, batch: ScheduleBatch): @@ -109,8 +109,8 @@ def compute_positions(self, batch: ScheduleBatch): self.positions = torch.tensor( np.concatenate( [ - np.arange(len(req.prefix_indices), len(req.fill_ids)) - for req in batch.reqs + np.arange(batch.prefix_lens_cpu[i], len(req.fill_ids)) + for i, req in enumerate(batch.reqs) ], axis=0, ), @@ -123,7 +123,7 @@ def compute_positions(self, batch: ScheduleBatch): np.concatenate( [ np.arange( - len(req.prefix_indices) + position_ids_offsets_cpu[i], + batch.prefix_lens_cpu[i] + position_ids_offsets_cpu[i], len(req.fill_ids) + position_ids_offsets_cpu[i], ) for i, req in enumerate(batch.reqs) @@ -141,12 +141,13 @@ def compute_extend_infos(self, batch: ScheduleBatch): self.extend_seq_lens = self.extend_start_loc = self.extend_no_prefix = None else: extend_lens_cpu = [ - len(r.fill_ids) - len(r.prefix_indices) for r in batch.reqs + len(r.fill_ids) - batch.prefix_lens_cpu[i] + for i, r in enumerate(batch.reqs) ] self.extend_seq_lens = torch.tensor(extend_lens_cpu, device="cuda") self.extend_start_loc = torch.zeros_like(self.seq_lens) self.extend_start_loc[1:] = torch.cumsum(self.extend_seq_lens[:-1], dim=0) - self.extend_no_prefix = all(len(r.prefix_indices) == 0 for r in batch.reqs) + self.extend_no_prefix = all(l == 0 for l in batch.prefix_lens_cpu) @classmethod def from_schedule_batch( @@ -180,14 +181,8 @@ def from_schedule_batch( if forward_mode != ForwardMode.DECODE: ret.init_multimuldal_info(batch) - prefix_lens = None - if forward_mode != ForwardMode.DECODE: - prefix_lens = torch.tensor( - [len(r.prefix_indices) for r in batch.reqs], device="cuda" - ) - if model_runner.server_args.disable_flashinfer: - ret.init_triton_args(batch, prefix_lens) + ret.init_triton_args(batch) 
flashinfer_use_ragged = False if not model_runner.server_args.disable_flashinfer: @@ -198,30 +193,35 @@ def from_schedule_batch( ): flashinfer_use_ragged = True ret.init_flashinfer_handlers( - model_runner, prefix_lens, flashinfer_use_ragged + model_runner, batch.prefix_lens_cpu, flashinfer_use_ragged ) return ret - def init_triton_args(self, batch: ScheduleBatch, prefix_lens): + def init_triton_args(self, batch: ScheduleBatch): """Init auxiliary variables for triton attention backend.""" self.triton_max_seq_len = int(torch.max(self.seq_lens)) - self.triton_prefix_lens = prefix_lens self.triton_start_loc = torch.zeros_like(self.seq_lens, dtype=torch.int32) self.triton_start_loc[1:] = torch.cumsum(self.seq_lens[:-1], dim=0) if self.forward_mode == ForwardMode.DECODE: self.triton_max_extend_len = None else: - extend_seq_lens = self.seq_lens - prefix_lens + self.triton_prefix_lens = torch.tensor(batch.prefix_lens_cpu, device="cuda") + extend_seq_lens = self.seq_lens - self.triton_prefix_lens self.triton_max_extend_len = int(torch.max(extend_seq_lens)) def init_flashinfer_handlers( self, model_runner, - prefix_lens, + prefix_lens_cpu, flashinfer_use_ragged, ): + if self.forward_mode != ForwardMode.DECODE: + prefix_lens = torch.tensor(prefix_lens_cpu, device="cuda") + else: + prefix_lens = None + update_flashinfer_indices( self.forward_mode, model_runner, diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 4f06f76300..6bbf3050ae 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -445,15 +445,6 @@ def _wait_and_warmup(server_args, pipe_finish_writer): print(f"Initialization failed. warmup error: {last_traceback}", flush=True) sys.exit(1) - # Print warnings here - if server_args.disable_radix_cache and server_args.chunked_prefill_size is not None: - logger.warning( - "You set both `--disable-radix-cache` and `--chunked-prefill-size`. " - "This combination is an experimental feature and we noticed it can lead to " - "wrong generation results. If you want to use chunked prefill, it is recommended " - "not using `--disable-radix-cache`." 
- ) - logger.info("The server is fired up and ready to roll!") if pipe_finish_writer is not None: pipe_finish_writer.send("init ok") diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 738ab7d1ab..99ecff6a58 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -80,6 +80,7 @@ class ServerArgs: disable_regex_jump_forward: bool = False disable_cuda_graph: bool = False disable_disk_cache: bool = False + enable_mixed_chunk: bool = False enable_torch_compile: bool = False enable_p2p_check: bool = False enable_mla: bool = False @@ -396,6 +397,11 @@ def add_cli_args(parser: argparse.ArgumentParser): action="store_true", help="Disable disk cache to avoid possible crashes related to file system or high concurrency.", ) + parser.add_argument( + "--enable-mixed-chunk", + action="store_true", + help="Enabling mixing prefill and decode in a chunked batch.", + ) parser.add_argument( "--enable-torch-compile", action="store_true", diff --git a/python/sglang/test/simple_eval_common.py b/python/sglang/test/simple_eval_common.py index 4cfd3515fe..d97d84de93 100644 --- a/python/sglang/test/simple_eval_common.py +++ b/python/sglang/test/simple_eval_common.py @@ -1,13 +1,12 @@ # Adapted from https://github.com/openai/simple-evals/ -import base64 import os import resource import time from collections import defaultdict from dataclasses import dataclass, field from multiprocessing.pool import ThreadPool -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Optional, Tuple import httpx import jinja2 @@ -44,8 +43,8 @@ class EvalResult: Result of running an evaluation (usually consisting of many samples) """ - score: float | None # top-line metric - metrics: Dict[str, float] | None # other metrics + score: Optional[float] # top-line metric + metrics: Optional[Dict[str, float]] # other metrics htmls: List[str] # strings of valid HTML convos: List[MessageList] # sampled conversations @@ -56,10 +55,10 @@ class SingleEvalResult: Result of evaluating a single sample """ - score: float | None + score: Optional[float] metrics: Dict[str, float] = field(default_factory=dict) - html: str | None = None - convo: MessageList | None = None # sampled conversation + html: Optional[str] = None + convo: Optional[MessageList] = None # sampled conversation class Eval: @@ -89,8 +88,8 @@ class ChatCompletionSampler(SamplerBase): def __init__( self, base_url: str = None, - model: str | None = None, - system_message: str | None = None, + model: Optional[str] = None, + system_message: Optional[str] = None, temperature: float = 0.0, max_tokens: int = 2048, ): @@ -272,7 +271,7 @@ def _compute_stat(values: list, stat: str): def aggregate_results( single_eval_results: List[SingleEvalResult], default_stats: Tuple[str] = ("mean", "std"), - name2stats: Dict[str, Tuple[str]] | None = None, + name2stats: Optional[Dict[str, Tuple[str]]] = None, ) -> EvalResult: """ Aggregate results from multiple evaluations into a single EvalResult. 
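
A side note on the `X | None` to `Optional[X]` changes in the eval utilities above: PEP 604 union syntax in annotations is evaluated eagerly and only works at runtime on Python 3.10+, so this change was likely made to keep the modules importable on older interpreters. The snippet below is a minimal illustration of the difference and is not taken from the repository.

```python
from typing import Optional


# Works on Python 3.8+: Optional[int] is an ordinary typing construct.
def scale(x: Optional[int] = None) -> int:
    return 0 if x is None else 2 * x


# On Python 3.9 and earlier, the following definition raises at import time:
#   TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'
# because `int | None` is evaluated when the function is defined
# (unless `from __future__ import annotations` is in effect).
# def scale_new(x: int | None = None) -> int:
#     return 0 if x is None else 2 * x
```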
diff --git a/python/sglang/test/simple_eval_gpqa.py b/python/sglang/test/simple_eval_gpqa.py index 46055caa5f..ec2abb4adc 100644 --- a/python/sglang/test/simple_eval_gpqa.py +++ b/python/sglang/test/simple_eval_gpqa.py @@ -8,6 +8,7 @@ import random import re +from typing import Optional import pandas @@ -28,7 +29,7 @@ class GPQAEval(Eval): def __init__( self, filename: str, - num_examples: int | None, + num_examples: Optional[int], num_threads: int, n_repeats: int = 1, ): diff --git a/python/sglang/test/simple_eval_humaneval.py b/python/sglang/test/simple_eval_humaneval.py index efb0d0bd6f..b0ad79d413 100644 --- a/python/sglang/test/simple_eval_humaneval.py +++ b/python/sglang/test/simple_eval_humaneval.py @@ -9,7 +9,7 @@ import random import re from concurrent.futures import ThreadPoolExecutor, as_completed -from typing import Dict, List +from typing import Dict, List, Optional import tqdm @@ -61,7 +61,7 @@ def evaluate_functional_correctness( class HumanEval(Eval): def __init__( self, - num_examples: int | None, + num_examples: Optional[int], num_threads: int, num_samples_per_task: int = 5, ks_passes: List[int] = [1, 2, 5], diff --git a/python/sglang/test/simple_eval_math.py b/python/sglang/test/simple_eval_math.py index 4ddb650d96..74c49abe51 100644 --- a/python/sglang/test/simple_eval_math.py +++ b/python/sglang/test/simple_eval_math.py @@ -8,6 +8,7 @@ import random import re +from typing import Optional import pandas @@ -36,7 +37,7 @@ def __init__( self, filename: str, equality_checker: SamplerBase, - num_examples: int | None, + num_examples: Optional[int], num_threads: int, ): df = pandas.read_csv(filename) diff --git a/python/sglang/test/simple_eval_mmlu.py b/python/sglang/test/simple_eval_mmlu.py index 3c0287510c..36a5c7fe35 100644 --- a/python/sglang/test/simple_eval_mmlu.py +++ b/python/sglang/test/simple_eval_mmlu.py @@ -8,6 +8,7 @@ import random import re +from typing import Optional import pandas @@ -84,7 +85,7 @@ class MMLUEval(Eval): - def __init__(self, filename: str, num_examples: int | None, num_threads: int): + def __init__(self, filename: str, num_examples: Optional[int], num_threads: int): df = pandas.read_csv(filename) examples = [row.to_dict() for _, row in df.iterrows()] if num_examples: diff --git a/test/srt/test_chunked_prefill.py b/test/srt/test_chunked_prefill.py index 94c4247624..8d81dc0c3e 100644 --- a/test/srt/test_chunked_prefill.py +++ b/test/srt/test_chunked_prefill.py @@ -11,11 +11,14 @@ class TestChunkedPrefill(unittest.TestCase): - def run_mmlu(self, disable_radix_cache): + def run_mmlu(self, disable_radix_cache, enable_mixed_chunk): other_args = ["--chunked-prefill-size", "32"] if disable_radix_cache: other_args += ["--disable-radix-cache"] + if enable_mixed_chunk: + other_args += ["--enable-mixed-chunk"] + model = DEFAULT_MODEL_NAME_FOR_TEST base_url = DEFAULT_URL_FOR_UNIT_TEST process = popen_launch_server( @@ -40,10 +43,16 @@ def run_mmlu(self, disable_radix_cache): kill_child_process(process.pid) def test_chunked_prefill(self): - self.run_mmlu(disable_radix_cache=False) + self.run_mmlu(disable_radix_cache=False, enable_mixed_chunk=False) + + def test_mixed_chunked_prefill(self): + self.run_mmlu(disable_radix_cache=False, enable_mixed_chunk=True) def test_chunked_prefill_without_radix_cache(self): - self.run_mmlu(disable_radix_cache=True) + self.run_mmlu(disable_radix_cache=True, enable_mixed_chunk=False) + + def test_mixed_chunked_prefill_without_radix_cache(self): + self.run_mmlu(disable_radix_cache=True, enable_mixed_chunk=True) if __name__ == 
"__main__": diff --git a/test/srt/test_eval_accuracy_large_chunked_prefill.py b/test/srt/test_eval_accuracy_large_chunked_prefill.py index 040a2db75f..bf4d071b88 100644 --- a/test/srt/test_eval_accuracy_large_chunked_prefill.py +++ b/test/srt/test_eval_accuracy_large_chunked_prefill.py @@ -6,7 +6,6 @@ from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_URL_FOR_ACCURACY_TEST, - DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server, ) diff --git a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py b/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py new file mode 100644 index 0000000000..b4d7602c42 --- /dev/null +++ b/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py @@ -0,0 +1,73 @@ +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_child_process +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_ACCURACY_TEST, + popen_launch_server, +) + + +class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_ACCURACY_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=300, + other_args=[ + "--log-level-http", + "warning", + "--chunked-prefill-size", + "256", + "--enable-mixed-chunk", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_child_process(cls.process.pid) + + def test_mmlu(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mmlu", + num_examples=3000, + num_threads=1024, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.71, f"{metrics}" + + def test_human_eval(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="humaneval", + num_examples=None, + num_threads=1024, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.64, f"{metrics}" + + def test_mgsm_en(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mgsm_en", + num_examples=None, + num_threads=1024, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.84, f"{metrics}" + + +if __name__ == "__main__": + unittest.main() From f624f6a6cc0a5578b9ef056b610e54e04518b26c Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Fri, 16 Aug 2024 15:12:38 -0700 Subject: [PATCH 044/118] Fix port conflicts between local CI and runner CI. 
(#1131) --- python/sglang/test/test_utils.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 66f3e4f35e..64bc4ea7cc 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -3,6 +3,7 @@ import argparse import asyncio import multiprocessing +import os import subprocess import threading import time @@ -22,10 +23,17 @@ DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct" DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1" -DEFAULT_URL_FOR_MOE_TEST = "http://127.0.0.1:6157" -DEFAULT_URL_FOR_ACCURACY_TEST = "http://127.0.0.1:7157" -DEFAULT_URL_FOR_UNIT_TEST = "http://127.0.0.1:8157" -DEFAULT_URL_FOR_E2E_TEST = "http://127.0.0.1:9157" + +if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + DEFAULT_URL_FOR_MOE_TEST = "http://127.0.0.1:6157" + DEFAULT_URL_FOR_ACCURACY_TEST = "http://127.0.0.1:7157" + DEFAULT_URL_FOR_UNIT_TEST = "http://127.0.0.1:8157" + DEFAULT_URL_FOR_E2E_TEST = "http://127.0.0.1:9157" +else: + DEFAULT_URL_FOR_MOE_TEST = "http://127.0.0.1:1157" + DEFAULT_URL_FOR_ACCURACY_TEST = "http://127.0.0.1:1257" + DEFAULT_URL_FOR_UNIT_TEST = "http://127.0.0.1:1357" + DEFAULT_URL_FOR_E2E_TEST = "http://127.0.0.1:1457" def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None): From 5d0d40d0eb8c347d8b3598f0a375696728df66c4 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Fri, 16 Aug 2024 21:41:11 -0700 Subject: [PATCH 045/118] Fix CI accuracy && time out limit (#1133) --- .github/workflows/unit-test.yml | 2 +- test/srt/test_eval_accuracy_large.py | 2 +- test/srt/test_eval_accuracy_large_chunked_prefill.py | 2 +- test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 4b61c4c4ed..3422cde40d 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -41,7 +41,7 @@ jobs: run: | cd test/srt python3 run_suite.py --suite minimal - timeout-minutes: 15 + timeout-minutes: 18 - name: Test Frontend Language run: | diff --git a/test/srt/test_eval_accuracy_large.py b/test/srt/test_eval_accuracy_large.py index 9f99b0b95d..470ed11aa4 100644 --- a/test/srt/test_eval_accuracy_large.py +++ b/test/srt/test_eval_accuracy_large.py @@ -37,7 +37,7 @@ def test_mmlu(self): ) metrics = run_eval(args) - assert metrics["score"] >= 0.71, f"{metrics}" + assert metrics["score"] >= 0.705, f"{metrics}" def test_human_eval(self): args = SimpleNamespace( diff --git a/test/srt/test_eval_accuracy_large_chunked_prefill.py b/test/srt/test_eval_accuracy_large_chunked_prefill.py index bf4d071b88..951f481da3 100644 --- a/test/srt/test_eval_accuracy_large_chunked_prefill.py +++ b/test/srt/test_eval_accuracy_large_chunked_prefill.py @@ -36,7 +36,7 @@ def test_mmlu(self): ) metrics = run_eval(args) - assert metrics["score"] >= 0.71, f"{metrics}" + assert metrics["score"] >= 0.705, f"{metrics}" def test_human_eval(self): args = SimpleNamespace( diff --git a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py b/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py index b4d7602c42..210c32b519 100644 --- a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py +++ b/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py @@ -42,7 +42,7 @@ def test_mmlu(self): ) metrics = run_eval(args) - assert metrics["score"] >= 0.71, f"{metrics}" + assert 
metrics["score"] >= 0.705, f"{metrics}" def test_human_eval(self): args = SimpleNamespace( From 9208591f05c39963f423fb3fee841f94276da187 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Sat, 17 Aug 2024 22:45:42 +0800 Subject: [PATCH 046/118] fix: use fp16 dtype for sm75 (#1136) --- python/sglang/srt/model_executor/model_runner.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 2de432144e..b74a19e60d 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -148,6 +148,11 @@ def load_model(self): f"[gpu={self.gpu_id}] Load weight begin. " f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB" ) + if torch.cuda.get_device_capability()[0] < 8: + logger.info( + "Compute capability below sm80 use float16 due to lack of bfloat16 support." + ) + self.server_args.dtype = "float16" monkey_patch_vllm_dummy_weight_loader() device_config = DeviceConfig() From cdc8d607524a9cf663d2319ff452168d99645e39 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sat, 17 Aug 2024 14:37:52 -0700 Subject: [PATCH 047/118] Improve the code style: more comments and remove useless packages (#1139) --- .../srt/managers/detokenizer_manager.py | 4 +- python/sglang/srt/managers/io_struct.py | 37 ++++++++++++++----- python/sglang/srt/server.py | 1 - 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py index 08ccfd5cef..12511ac44e 100644 --- a/python/sglang/srt/managers/detokenizer_manager.py +++ b/python/sglang/srt/managers/detokenizer_manager.py @@ -17,7 +17,6 @@ import asyncio import dataclasses -import inspect from typing import List import uvloop @@ -126,8 +125,6 @@ async def handle_loop(self): spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0], ) - # Trim stop str - # TODO(lmzheng): handle the case where multiple stop strs are hit output_strs = [] for i in range(bs): s = self.decode_status[recv_obj.rids[i]] @@ -144,6 +141,7 @@ async def handle_loop(self): output_strs.append(s.decoded_text + new_text) + # Trim stop str. TODO(lmzheng): handle the case where multiple stop strs are hit if isinstance(recv_obj.finished_reason[i], FINISH_MATCHED_STR): pos = output_strs[i].find(recv_obj.finished_reason[i].matched) if pos != -1: diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 2d12505ae4..82f280b606 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -22,8 +22,6 @@ from dataclasses import dataclass from typing import Dict, List, Optional, Union -import torch - from sglang.srt.managers.schedule_batch import BaseFinishReason from sglang.srt.sampling_params import SamplingParams @@ -43,9 +41,9 @@ class GenerateReqInput: rid: Optional[Union[List[str], str]] = None # Whether to return logprobs. return_logprob: Optional[Union[List[bool], bool]] = None - # The start location of the prompt for return_logprob. + # If return logprobs, the start location in the prompt for returning logprobs. logprob_start_len: Optional[Union[List[int], int]] = None - # The number of top logprobs to return. + # If return logprobs, the number of top logprobs to return at each position. top_logprobs_num: Optional[Union[List[int], int]] = None # Whether to detokenize tokens in text in the returned logprobs. 
return_text_in_logprobs: bool = False @@ -155,16 +153,27 @@ def post_init(self): @dataclass class TokenizedGenerateReqInput: + # The request id rid: str + # The input text input_text: str + # The input token ids input_ids: List[int] + # The pixel values for input images pixel_values: List[float] + # The hash of input images image_hash: int + # The image size image_size: List[int] + # The sampling parameters sampling_params: SamplingParams + # Whether to return the logprobs return_logprob: bool + # If return logprobs, the start location in the prompt for returning logprobs. logprob_start_len: int + # If return logprobs, the number of top logprobs to return at each position. top_logprobs_num: int + # Whether to stream output stream: bool @@ -215,15 +224,21 @@ def post_init(self): @dataclass class TokenizedEmbeddingReqInput: + # The request id rid: str + # The input text input_text: str + # The input token ids input_ids: List[int] + # Dummy sampling params for compatibility sampling_params: SamplingParams @dataclass class BatchTokenIDOut: + # The request id rids: List[str] + # The version id to sync decode status with in detokenizer_manager vids: List[int] decoded_texts: List[str] decode_ids: List[int] @@ -236,17 +251,25 @@ class BatchTokenIDOut: @dataclass class BatchStrOut: + # The request id rids: List[str] + # The output decoded strings output_strs: List[str] + # The meta info meta_info: List[Dict] + # The finish reason finished_reason: List[BaseFinishReason] @dataclass class BatchEmbeddingOut: + # The request id rids: List[str] + # The output embedding embeddings: List[List[float]] + # The meta info meta_info: List[Dict] + # The finish reason finished_reason: List[BaseFinishReason] @@ -257,9 +280,5 @@ class FlushCacheReq: @dataclass class AbortReq: + # The request id rid: str - - -@dataclass -class DetokenizeReqInput: - input_ids: List[int] diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 6bbf3050ae..9028c12309 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -34,7 +34,6 @@ setattr(threading, "_register_atexit", lambda *args, **kwargs: None) import aiohttp -import psutil import requests import uvicorn import uvloop From 57d0bd91ec1775cd150629db14d39e07a876a45b Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sat, 17 Aug 2024 17:43:23 -0700 Subject: [PATCH 048/118] Improve benchmark (#1140) --- benchmark/gsm8k/bench_other.py | 3 +- benchmark/latency_throughput/README.md | 105 ----- benchmark/latency_throughput/bench_one.py | 147 ------- benchmark/latency_throughput/bench_serving.py | 374 ------------------ docs/en/benchmark_and_profiling.md | 49 +++ python/sglang/bench_serving.py | 74 ++-- python/sglang/srt/hf_transformers_utils.py | 12 +- python/sglang/test/test_utils.py | 25 +- 8 files changed, 111 insertions(+), 678 deletions(-) delete mode 100644 benchmark/latency_throughput/README.md delete mode 100644 benchmark/latency_throughput/bench_one.py delete mode 100644 benchmark/latency_throughput/bench_serving.py create mode 100644 docs/en/benchmark_and_profiling.md diff --git a/benchmark/gsm8k/bench_other.py b/benchmark/gsm8k/bench_other.py index c80c17a249..2a938d6bb9 100644 --- a/benchmark/gsm8k/bench_other.py +++ b/benchmark/gsm8k/bench_other.py @@ -65,10 +65,9 @@ def main(args): def get_one_answer(i): answer = call_generate( prompt=few_shot_examples + questions[i], - # prompt="System: " + few_shot_examples + "<|separator|>\n\n" + questions[i], temperature=0, max_tokens=256, - stop="Question", + stop=["Question", "Assistant:", 
"<|separator|>"], ) states[i] = answer diff --git a/benchmark/latency_throughput/README.md b/benchmark/latency_throughput/README.md deleted file mode 100644 index b1061793aa..0000000000 --- a/benchmark/latency_throughput/README.md +++ /dev/null @@ -1,105 +0,0 @@ -# Benchmark Latency and Throughput - -## SGLang - -### Launch a server -``` -python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 -``` - -### Benchmark one batch - -``` -python3 bench_one.py -python3 bench_one.py --batch-size 64 -``` - -### Benchmark online serving with many requests - -``` -python3 bench_serving.py --backend srt --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --num-prompt 1000 --request-rate 100 --input-len 1024 --output-len 256 -``` - -### Benchmark online serving on the ShareGPT dataset - -#### Download data -``` -wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -``` - -#### Run ShareGPT -``` -python3 bench_serving.py --backend srt --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10 -``` - -### Profile with Nsight -0. Prerequisite -```bash -# install nsys -# https://docs.nvidia.com/nsight-systems/InstallationGuide/index.html -apt update -apt install -y --no-install-recommends gnupg -echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu$(source /etc/lsb-release; echo "$DISTRIB_RELEASE" | tr -d .)/$(dpkg --print-architecture) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list -apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub -apt update -apt install nsight-systems-cli -``` - -1. To profile a single batch, use `nsys profile --cuda-graph-trace=node python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3-8B --batch-size 64 --input-len 512` - -2. To profile a server, e.g. - -```bash -# server -# set the delay and duration times according to needs -nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node -o sglang.out --delay 60 --duration 70 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --disable-radix-cache - -# client -python3 -m sglang.bench_serving --backend sglang --num-prompts 6000 --dataset-name random --random-input 4096 --random-output 2048 -``` - -3. Use NVTX, e.g. 
- -```bash -# install nvtx -pip install nvtx - -# code snippets -import nvtx -with nvtx.annotate("description", color="color"): - # some critical code -``` - - -## Other baselines - -### vLLM -``` -python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel 1 --disable-log-requests --swap-space 16 --port 21000 -``` - -``` -# run synthetic -python3 bench_serving.py --backend vllm --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --num-prompt 1000 --request-rate 100 --input-len 1024 --output-len 256 -``` - -``` -# run ShareGPT -python3 bench_serving.py --backend vllm --port 21000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10 -``` - -``` -# run one batch -python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B --tensor 8 --disable-log-requests --max-num-seqs 1024 --quantization fp8 - -python3 bench_one.py --input-len 1024 --batch-size 1 1 2 4 8 16 32 64 128 256 512 768 1024 --port 8000 --backend vllm -``` - -### LightLLM -``` -python -m lightllm.server.api_server --model_dir ~/model_weights/Llama-2-7b-chat-hf --max_total_token_num 15600 --tokenizer_mode auto --port 22000 -``` - -``` -python3 bench_serving.py --backend lightllm --port 22000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10 -``` diff --git a/benchmark/latency_throughput/bench_one.py b/benchmark/latency_throughput/bench_one.py deleted file mode 100644 index b390c44a53..0000000000 --- a/benchmark/latency_throughput/bench_one.py +++ /dev/null @@ -1,147 +0,0 @@ -""" -Usage: -python3 bench_one.py --input-len 2048 --batch-size 1 2 4 8 16 32 64 128 256 512 -""" - -import argparse -import json -import time - -import numpy as np -import requests - - -def run_one_batch_size(bs): - url = f"{args.host}:{args.port}" - max_new_tokens = args.max_tokens - - if args.input_len: - input_ids = [ - [int(x) for x in np.random.randint(0, high=16384, size=(args.input_len,))] - for _ in range(bs) - ] - else: - text = [f"{i, }" for i in range(bs)] - - tic = time.time() - if args.backend == "srt": - if args.input_len: - inputs = {"input_ids": input_ids} - else: - inputs = {"text": text} - - response = requests.post( - url + "/generate", - json={ - "sampling_params": { - "temperature": 0, - "max_new_tokens": max_new_tokens, - "ignore_eos": True, - }, - **inputs, - }, - ) - elif args.backend == "lightllm": - response = requests.post( - url + "/generate", - json={ - "inputs": text[0], - "parameters": { - "temperature": 0, - "max_new_tokens": max_new_tokens, - "ignore_eos": True, - }, - }, - ) - elif args.backend == "vllm": - if args.input_len: - inputs = {"prompt": input_ids} - else: - inputs = {"prompt": text} - - response = requests.post( - url + "/v1/completions", - json={ - "model": args.vllm_model_name, - "temperature": 0, - "max_tokens": max_new_tokens, - "ignore_eos": True, - **inputs, - }, - ) - elif args.backend == "ginfer": - import grpc - from ginfer import sampler_pb2, sampler_pb2_grpc - - sampler_channel = grpc.insecure_channel(url.replace("http://", "")) - sampler = sampler_pb2_grpc.SamplerStub(sampler_channel) - - tic = time.time() - sample_request = sampler_pb2.SampleTextRequest( - prompt=text[0], - settings=sampler_pb2.SampleSettings( - max_len=max_new_tokens, - rng_seed=0, - temperature=0, - nucleus_p=1, - ), - ) - stream = sampler.SampleText(sample_request) - response = "".join([x.text for x in stream]) - latency = 
time.time() - tic - - if isinstance(response, str): - ret = response - else: - ret = response.json() - print(ret) - - input_len = args.input_len if args.input_len else 1 - output_len = max_new_tokens - - output_throughput = bs * max_new_tokens / latency - overall_throughput = bs * (input_len + output_len) / latency - print(f"latency: {latency:.2f} s") - print(f"output throughput: {output_throughput:.2f} token/s") - print(f"(input + output) throughput: {overall_throughput:.2f} token/s") - - with open("results.jsonl", "a") as fout: - res = { - "backend": args.backend, - "input_len": args.input_len, - "output_len": args.max_tokens, - "batch_size": bs, - "latency": latency, - "output_throughput": output_throughput, - "overall_throughput": overall_throughput, - } - fout.write(json.dumps(res) + "\n") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--host", type=str, default="http://127.0.0.1") - parser.add_argument("--port", type=int, default=None) - parser.add_argument("--backend", type=str, default="srt") - parser.add_argument("--input-len", type=int, default=None) - parser.add_argument("--batch-size", type=int, nargs="*", default=[1]) - parser.add_argument("--max-tokens", type=int, default=256) - parser.add_argument( - "--vllm-model-name", type=str, default="meta-llama/Meta-Llama-3-70B" - ) - args = parser.parse_args() - - if args.port is None: - if args.backend == "srt": - args.port = 30000 - elif args.backend == "vllm": - args.port = 21000 - elif args.backend == "lightllm": - args.port = 22000 - elif args.backend == "ginfer": - args.port = 9988 - else: - raise ValueError(f"Invalid backend: {args.backend}") - - for bs in args.batch_size: - run_one_batch_size(bs) diff --git a/benchmark/latency_throughput/bench_serving.py b/benchmark/latency_throughput/bench_serving.py deleted file mode 100644 index 74fafc9494..0000000000 --- a/benchmark/latency_throughput/bench_serving.py +++ /dev/null @@ -1,374 +0,0 @@ -"""Benchmark online serving throughput. - -On the server side, run one of the following commands: - (vLLM backend) - python -m vllm.entrypoints.api_server \ - --model --swap-space 16 \ - --disable-log-requests - - (TGI backend) - ./launch_hf_server.sh - -On the client side, run: - python benchmarks/benchmark_serving.py \ - --backend \ - --tokenizer --dataset \ - --request-rate -""" - -import argparse -import asyncio -import json -import os -import random -import time -from typing import AsyncGenerator, List, Tuple - -import aiohttp -import numpy as np -from tqdm.asyncio import tqdm_asyncio -from transformers import AutoTokenizer - -# (prompt len, output len, latency) -REQUEST_LATENCY: List[Tuple[int, int, float]] = [] - - -def sample_requests( - dataset_path: str, - num_requests: int, - tokenizer: AutoTokenizer, -) -> List[Tuple[str, int, int]]: - def load_dataset(): - with open(dataset_path, encoding="utf-8") as f: - dataset = json.load(f) - # Filter out the conversations with less than 2 turns. - dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. - dataset = [ - (data["conversations"][0]["value"], data["conversations"][1]["value"]) - for data in dataset - ] - - # Tokenize the prompts and completions. 
- prompts = [prompt for prompt, _ in dataset] - prompt_token_ids = tokenizer(prompts).input_ids - completions = [completion for _, completion in dataset] - completion_token_ids = tokenizer(completions).input_ids - tokenized_dataset = [] - for i in range(len(dataset)): - output_len = len(completion_token_ids[i]) - tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) - - # Filter out too long sequences. - filtered_dataset: List[Tuple[str, int, int]] = [] - for prompt, prompt_token_ids, output_len in tokenized_dataset: - prompt_len = len(prompt_token_ids) - if prompt_len < 4 or output_len < 4: - # Prune too short sequences. - # This is because TGI causes errors when the input or output length - # is too short. - continue - if prompt_len > 1024 or prompt_len + output_len > 2048: - # Prune too long sequences. - continue - filtered_dataset.append((prompt, prompt_len, output_len)) - - return filtered_dataset - - try: - from diskcache import Cache - - home_dir = os.path.expanduser("~") - cache = Cache(f"{home_dir}/.cache/sglang") - with Cache(cache.directory) as reference: - reference_key = f"{dataset_path}_{tokenizer.name_or_path}" - if reference_key in reference: - print("Reading dataset from cache...") - dataset = reference[reference_key] - else: - dataset = load_dataset() - reference[reference_key] = dataset - except ImportError: - dataset = load_dataset() - - # Sample the requests. - sampled_requests = random.sample(dataset, num_requests) - return sampled_requests - - -async def get_request( - input_requests: List[Tuple[str, int, int]], - request_rate: float, -) -> AsyncGenerator[Tuple[str, int, int], None]: - input_requests = iter(input_requests) - for request in input_requests: - yield request - - if request_rate == float("inf"): - # If the request rate is infinity, then we don't need to wait. - continue - # Sample the request interval from the exponential distribution. - interval = np.random.exponential(1.0 / request_rate) - # The next request will be sent after the interval. 
- await asyncio.sleep(interval) - - -async def send_request( - backend: str, - api_url: str, - prompt: str, - prompt_len: int, - output_len: int, - best_of: int, - use_beam_search: bool, -) -> None: - request_start_time = time.perf_counter() - - headers = {"User-Agent": "Benchmark Client"} - if backend == "vllm": - pload = { - "prompt": prompt, - "n": 1, - "best_of": best_of, - "use_beam_search": use_beam_search, - "temperature": 0.0 if use_beam_search else 1.0, - "top_p": 1.0, - "max_tokens": output_len, - "ignore_eos": True, - "stream": False, - } - elif backend == "tgi": - assert not use_beam_search - params = { - "best_of": best_of, - "max_new_tokens": output_len, - "do_sample": True, - } - pload = { - "inputs": prompt, - "parameters": params, - } - elif backend == "srt": - assert not use_beam_search - params = { - "ignore_eos": True, - "max_new_tokens": output_len, - } - pload = { - "text": prompt, - "sampling_params": params, - } - elif backend == "lightllm": - assert not use_beam_search - params = { - "ignore_eos": True, - "max_new_tokens": output_len, - } - pload = { - "inputs": prompt, - "parameters": params, - } - elif backend == "ginfer": - pass - else: - raise ValueError(f"Unknown backend: {backend}") - - if backend != "ginfer": - timeout = aiohttp.ClientTimeout(total=3 * 3600) - async with aiohttp.ClientSession(timeout=timeout) as session: - while True: - async with session.post( - api_url, headers=headers, json=pload - ) as response: - chunks = [] - async for chunk, _ in response.content.iter_chunks(): - chunks.append(chunk) - output = b"".join(chunks).decode("utf-8") - output = json.loads(output) - - # Re-send the request if it failed. - if "error" not in output: - break - else: - print(output) - else: - import grpc - from ginfer import sampler_pb2, sampler_pb2_grpc - - api_url = api_url.replace("http://", "").replace("/generate", "") - sampler_channel = grpc.aio.insecure_channel(api_url) - sampler = sampler_pb2_grpc.SamplerStub(sampler_channel) - - request_end_time = time.perf_counter() - sample_request = sampler_pb2.SampleTextRequest( - prompt=prompt, - settings=sampler_pb2.SampleSettings( - max_len=output_len, - rng_seed=0, - temperature=0, - nucleus_p=1, - ), - ) - stream = sampler.SampleText(sample_request) - response = "".join([x.text async for x in stream]) - - request_end_time = time.perf_counter() - request_latency = request_end_time - request_start_time - REQUEST_LATENCY.append((prompt_len, output_len, request_latency)) - - -async def benchmark( - backend: str, - api_url: str, - input_requests: List[Tuple[str, int, int]], - best_of: int, - use_beam_search: bool, - request_rate: float, -) -> None: - tasks: List[asyncio.Task] = [] - async for request in get_request(input_requests, request_rate): - prompt, prompt_len, output_len = request - task = asyncio.create_task( - send_request( - backend, - api_url, - prompt, - prompt_len, - output_len, - best_of, - use_beam_search, - ) - ) - tasks.append(task) - await tqdm_asyncio.gather(*tasks) - - -def main(args: argparse.Namespace): - print(args) - random.seed(args.seed) - np.random.seed(args.seed) - - api_url = f"{args.host}:{args.port}/generate" - if args.tokenizer.endswith(".json") or args.tokenizer.endswith(".model"): - from sglang.srt.hf_transformers_utils import get_tokenizer - - tokenizer = get_tokenizer(args.tokenizer) - else: - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, trust_remote_code=args.trust_remote_code - ) - - if args.dataset: - input_requests = sample_requests(args.dataset, 
args.num_prompts, tokenizer) - else: - input_lens = np.random.randint( - int(args.input_len * args.range_ratio), - args.input_len + 1, - size=args.num_prompts, - ) - output_lens = np.random.randint( - int(args.output_len * args.range_ratio), - args.output_len + 1, - size=args.num_prompts, - ) - offsets = np.random.randint(0, tokenizer.vocab_size, size=args.num_prompts) - input_requests = [] - for i in range(args.num_prompts): - prompt = tokenizer.decode( - [ - (offsets[i] + i + j) % (tokenizer.vocab_size - 129) + 128 - for j in range(input_lens[i]) - ] - ) - input_requests.append((prompt, int(input_lens[i]), int(output_lens[i]))) - - benchmark_start_time = time.perf_counter() - asyncio.run( - benchmark( - args.backend, - api_url, - input_requests, - args.best_of, - args.use_beam_search, - args.request_rate, - ) - ) - benchmark_end_time = time.perf_counter() - benchmark_time = benchmark_end_time - benchmark_start_time - - # Compute the statistics. - latencies = [latency for _, _, latency in REQUEST_LATENCY] - avg_latency = np.mean(latencies) - avg_per_token_latency = np.mean( - [ - latency / (prompt_len + output_len) - for prompt_len, output_len, latency in REQUEST_LATENCY - ] - ) - avg_per_output_token_latency = np.mean( - [latency / output_len for _, output_len, latency in REQUEST_LATENCY] - ) - decoding_throughput = ( - np.sum([output_len for _, output_len, _ in REQUEST_LATENCY]) / benchmark_time - ) - - # latencies = [round(latency, 2) for _, _, latency in REQUEST_LATENCY] - # print(latencies) - - print(f"Total time: {benchmark_time:.2f} s") - print(f"Request throughput: {args.num_prompts / benchmark_time:.2f} requests/s") - print(f"Decoding throughput: {decoding_throughput:.2f} token/s") - print(f"Average latency: {avg_latency:.2f} s") - print(f"Average latency per token: {avg_per_token_latency:.2f} s") - print(f"Average latency per output token: {avg_per_output_token_latency:.2f} s") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Benchmark the online serving throughput." - ) - parser.add_argument( - "--backend", - type=str, - default="srt", - choices=["vllm", "tgi", "srt", "lightllm", "ginfer"], - ) - parser.add_argument("--host", type=str, default="http://localhost") - parser.add_argument("--port", type=int, default=30000) - parser.add_argument("--dataset", type=str, help="Path to the dataset.") - parser.add_argument("--input-len", type=int, default=2048) - parser.add_argument("--output-len", type=int, default=256) - parser.add_argument("--range-ratio", type=float, default=1.0) - parser.add_argument( - "--tokenizer", - type=str, - default="NousResearch/Meta-Llama-3-8B", - help="Name or path of the tokenizer.", - ) - parser.add_argument( - "--best-of", - type=int, - default=1, - help="Generates `best_of` sequences per prompt and " "returns the best one.", - ) - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument( - "--num-prompts", type=int, default=1000, help="Number of prompts to process." - ) - parser.add_argument( - "--request-rate", - type=float, - default=float("inf"), - help="Number of requests per second. If this is inf, " - "then all the requests are sent at time 0. 
" - "Otherwise, we use Poisson process to synthesize " - "the request arrival times.", - ) - parser.add_argument("--seed", type=int, default=0) - parser.add_argument( - "--trust-remote-code", - action="store_true", - help="trust remote code from huggingface", - ) - args = parser.parse_args() - main(args) diff --git a/docs/en/benchmark_and_profiling.md b/docs/en/benchmark_and_profiling.md new file mode 100644 index 0000000000..3fbd935891 --- /dev/null +++ b/docs/en/benchmark_and_profiling.md @@ -0,0 +1,49 @@ +# Benchmark and Profiling + +## Benchmark +- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`. + ``` + python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32 + ``` +- Benchmark online serving. Launch a server first and run the following command. + ``` + python3 -m sglang.bench_serving --backend sglang --num-prompt 10 + ``` + +## Profile with Nsight +0. Prerequisite +```bash +# install nsys +# https://docs.nvidia.com/nsight-systems/InstallationGuide/index.html +apt update +apt install -y --no-install-recommends gnupg +echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu$(source /etc/lsb-release; echo "$DISTRIB_RELEASE" | tr -d .)/$(dpkg --print-architecture) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list +apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub +apt update +apt install nsight-systems-cli +``` + +1. To profile a single batch, use `nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3-8B --batch-size 64 --input-len 512` + +2. To profile a server, e.g. + +```bash +# server +# set the delay and duration times according to needs +nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node -o sglang.out --delay 60 --duration 70 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --disable-radix-cache + +# client +python3 -m sglang.bench_serving --backend sglang --num-prompts 6000 --dataset-name random --random-input 4096 --random-output 2048 +``` + +3. Use NVTX, e.g. + +```bash +# install nvtx +pip install nvtx + +# code snippets +import nvtx +with nvtx.annotate("description", color="color"): + # some critical code +``` \ No newline at end of file diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 0f9c882234..30a079e876 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -149,10 +149,12 @@ async def async_request_openai_completions( "completions" ), "OpenAI Completions API URL must end with 'completions'." 
+ prompt = request_func_input.prompt + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: payload = { "model": request_func_input.model, - "prompt": request_func_input.prompt, + "prompt": prompt, "temperature": 0.0, "best_of": 1, "max_tokens": request_func_input.output_len, @@ -220,6 +222,13 @@ async def async_request_openai_completions( return output +async def async_request_ginfer( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + raise NotImplementedError() + + def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv("SGLANG_USE_MODELSCOPE", "False").lower() == "true": import huggingface_hub.constants @@ -238,6 +247,13 @@ def get_model(pretrained_model_name_or_path: str) -> str: def get_tokenizer( pretrained_model_name_or_path: str, ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + if pretrained_model_name_or_path.endswith( + ".json" + ) or pretrained_model_name_or_path.endswith(".model"): + from sglang.srt.hf_transformers_utils import get_tokenizer + + return get_tokenizer(pretrained_model_name_or_path) + if pretrained_model_name_or_path is not None and not os.path.exists( pretrained_model_name_or_path ): @@ -252,6 +268,7 @@ def get_tokenizer( "vllm": async_request_openai_completions, "lmdeploy": async_request_openai_completions, "trt": async_request_trt_llm, + "ginfer": async_request_ginfer, } @@ -351,9 +368,9 @@ def sample_sharegpt_requests( # Tokenize the prompts and completions. prompt = dataset[i][0] - prompt_token_ids = tokenizer(prompt).input_ids + prompt_token_ids = tokenizer.encode(prompt) completion = dataset[i][1] - completion_token_ids = tokenizer(completion).input_ids + completion_token_ids = tokenizer.encode(completion) prompt_len = len(prompt_token_ids) output_len = ( len(completion_token_ids) if fixed_output_len is None else fixed_output_len @@ -361,7 +378,9 @@ def sample_sharegpt_requests( if prompt_len < 4 or output_len < 4: # Prune too short sequences. continue - if prompt_len > 1024 or prompt_len + output_len > 2048: + if prompt_len > 1024 or ( + prompt_len + output_len > 2048 and fixed_output_len is None + ): # Prune too long sequences. continue filtered_dataset.append((prompt, prompt_len, output_len)) @@ -422,7 +441,7 @@ def sample_random_requests( for i in range(num_prompts): # Tokenize the prompts and completions. 
prompt = dataset[i][0] - prompt_token_ids = tokenizer(prompt).input_ids + prompt_token_ids = tokenizer.encode(prompt) prompt_len = len(prompt_token_ids) if prompt_len > input_lens[i]: @@ -488,7 +507,7 @@ def calculate_metrics( output_len = outputs[i].output_len output_lens.append(output_len) retokenized_output_len = len( - tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids + tokenizer.encode(outputs[i].generated_text, add_special_tokens=False) ) retokenized_output_lens.append(retokenized_output_len) total_input += input_requests[i][1] @@ -547,7 +566,6 @@ async def benchmark( input_requests: List[Tuple[str, int, int]], request_rate: float, disable_tqdm: bool, - enable_multi: bool, extra_request_body: Dict[str, Any], ): if backend in ASYNC_REQUEST_FUNCS: @@ -756,6 +774,7 @@ def run_benchmark(args_: argparse.Namespace): global args args = args_ + # Set global environments set_ulimit() random.seed(args.seed) np.random.seed(args.seed) @@ -764,12 +783,14 @@ def run_benchmark(args_: argparse.Namespace): if args.extra_request_body: extra_request_body = json.loads(args.extra_request_body) + # Set url if args.port is None: args.port = { "sglang": 30000, "lmdeploy": 23333, "vllm": 8000, "trt": 8000, + "ginfer": 9988, }.get(args.backend, 30000) api_url = ( @@ -792,7 +813,11 @@ def run_benchmark(args_: argparse.Namespace): if args.model is None: print("Please provide a model using `--model` when using `trt` backend.") sys.exit(1) + elif args.backend == "ginfer": + api_url = args.base_url if args.base_url else f"{args.host}:{args.port}" + args.model = args.model or "default" + # Get model name if args.model is None: try: response = requests.get(model_url) @@ -817,6 +842,7 @@ def run_benchmark(args_: argparse.Namespace): print(f"{args}\n") + # Read dataset backend = args.backend model_id = args.model tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model @@ -842,7 +868,21 @@ def run_benchmark(args_: argparse.Namespace): else: raise ValueError(f"Unknown dataset: {args.dataset_name}") - if args.multi: + if not args.multi: + return asyncio.run( + benchmark( + backend=backend, + api_url=api_url, + model_id=model_id, + tokenizer=tokenizer, + input_requests=input_requests, + request_rate=args.request_rate, + disable_tqdm=args.disable_tqdm, + extra_request_body=extra_request_body, + ) + ) + else: + # Benchmark multiple rps. TODO: use a fixed duration to compute num_prompts request_rates = parse_request_rate_range(args.request_rate_range) for rate in request_rates: @@ -855,27 +895,11 @@ def run_benchmark(args_: argparse.Namespace): input_requests=input_requests, request_rate=rate, disable_tqdm=args.disable_tqdm, - enable_multi=args.multi, extra_request_body=extra_request_body, ) ) - else: - return asyncio.run( - benchmark( - backend=backend, - api_url=api_url, - model_id=model_id, - tokenizer=tokenizer, - input_requests=input_requests, - request_rate=args.request_rate, - disable_tqdm=args.disable_tqdm, - enable_multi=args.multi, - extra_request_body=extra_request_body, - ) - ) -# to avoid relying on SGLang's components def set_ulimit(target_soft_limit=65535): resource_type = resource.RLIMIT_NOFILE current_soft, current_hard = resource.getrlimit(resource_type) @@ -968,7 +992,7 @@ def set_ulimit(target_soft_limit=65535): help="Number of requests per second. If this is inf, then all the requests are sent at time 0. " "Otherwise, we use Poisson process to synthesize the request arrival times. 
Default is 128.0.", ) - parser.add_argument("--seed", type=int, default=0, help="Default is 0.") + parser.add_argument("--seed", type=int, default=1, help="The random seed.") parser.add_argument( "--multi", action="store_true", diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py index 508843a395..76a8c90439 100644 --- a/python/sglang/srt/hf_transformers_utils.py +++ b/python/sglang/srt/hf_transformers_utils.py @@ -30,7 +30,17 @@ PreTrainedTokenizer, PreTrainedTokenizerFast, ) -from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig + +try: + from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig + + _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { + ChatGLMConfig.model_type: ChatGLMConfig, + DbrxConfig.model_type: DbrxConfig, + } +except ImportError: + # We want this file to run without vllm dependency + _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {} from sglang.srt.utils import is_multimodal_model diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 64bc4ea7cc..72fd54efe5 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -113,30 +113,7 @@ def call_generate_srt_raw(prompt, temperature, max_tokens, stop=None, url=None): def call_generate_ginfer(prompt, temperature, max_tokens, stop=None, url=None): - import grpc - from ginfer import sampler_pb2, sampler_pb2_grpc - - sampler_channel = grpc.insecure_channel(url.replace("http://", "")) - sampler = sampler_pb2_grpc.SamplerStub(sampler_channel) - - if stop is None: - stop_strings = None - else: - stop_strings = [stop] - - sample_request = sampler_pb2.SampleTextRequest( - prompt=prompt, - settings=sampler_pb2.SampleSettings( - max_len=max_tokens, - rng_seed=0, - temperature=max(temperature, 1e-7), - nucleus_p=1, - stop_strings=stop_strings, - ), - ) - stream = sampler.SampleText(sample_request) - response = "".join([x.text for x in stream]) - return response + raise NotImplementedError() def call_generate_guidance( From 3c1f5a92200e112a07d467771af879942d2dd440 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sat, 17 Aug 2024 18:03:00 -0700 Subject: [PATCH 049/118] Fix duplicated imports in hf_transformers_utils.py (#1141) --- python/sglang/bench_serving.py | 8 ++++---- python/sglang/srt/hf_transformers_utils.py | 5 ----- python/sglang/test/test_utils.py | 10 +++++----- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 30a079e876..e2a99f9fde 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -222,7 +222,7 @@ async def async_request_openai_completions( return output -async def async_request_ginfer( +async def async_request_gserver( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, ) -> RequestFuncOutput: @@ -268,7 +268,7 @@ def get_tokenizer( "vllm": async_request_openai_completions, "lmdeploy": async_request_openai_completions, "trt": async_request_trt_llm, - "ginfer": async_request_ginfer, + "gserver": async_request_gserver, } @@ -790,7 +790,7 @@ def run_benchmark(args_: argparse.Namespace): "lmdeploy": 23333, "vllm": 8000, "trt": 8000, - "ginfer": 9988, + "gserver": 9988, }.get(args.backend, 30000) api_url = ( @@ -813,7 +813,7 @@ def run_benchmark(args_: argparse.Namespace): if args.model is None: print("Please provide a model using `--model` when using `trt` backend.") sys.exit(1) - elif args.backend == "ginfer": + elif args.backend == "gserver": 
api_url = args.base_url if args.base_url else f"{args.host}:{args.port}" args.model = args.model or "default" diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py index 76a8c90439..fb198fd73c 100644 --- a/python/sglang/srt/hf_transformers_utils.py +++ b/python/sglang/srt/hf_transformers_utils.py @@ -44,11 +44,6 @@ from sglang.srt.utils import is_multimodal_model -_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { - ChatGLMConfig.model_type: ChatGLMConfig, - DbrxConfig.model_type: DbrxConfig, -} - def download_from_hf(model_path: str): if os.path.exists(model_path): diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 72fd54efe5..9f6aa68ab1 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -112,7 +112,7 @@ def call_generate_srt_raw(prompt, temperature, max_tokens, stop=None, url=None): return pred -def call_generate_ginfer(prompt, temperature, max_tokens, stop=None, url=None): +def call_generate_gserver(prompt, temperature, max_tokens, stop=None, url=None): raise NotImplementedError() @@ -256,7 +256,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser): "vllm", "outlines", "lightllm", - "ginfer", + "gserver", "guidance", "lmql", "srt-raw", @@ -277,7 +277,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser): "lightllm": 22000, "lmql": 23000, "srt-raw": 30000, - "ginfer": 9988, + "gserver": 9988, } args.port = default_port.get(args.backend, None) return args @@ -313,8 +313,8 @@ def _get_call_generate(args: argparse.Namespace): return partial(call_generate_vllm, url=f"{args.host}:{args.port}/generate") elif args.backend == "srt-raw": return partial(call_generate_srt_raw, url=f"{args.host}:{args.port}/generate") - elif args.backend == "ginfer": - return partial(call_generate_ginfer, url=f"{args.host}:{args.port}") + elif args.backend == "gserver": + return partial(call_generate_gserver, url=f"{args.host}:{args.port}") elif args.backend == "outlines": return partial(call_generate_outlines, url=f"{args.host}:{args.port}/generate") elif args.backend == "guidance": From fa13b95d6be5c246693492a1c7246cb112930252 Mon Sep 17 00:00:00 2001 From: min-xu-et <168487304+min-xu-et@users.noreply.github.com> Date: Sun, 18 Aug 2024 14:29:09 -0700 Subject: [PATCH 050/118] fixed a typo (#1143) --- python/sglang/bench_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index e2a99f9fde..69d175d843 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -990,7 +990,7 @@ def set_ulimit(target_soft_limit=65535): type=float, default=float("inf"), help="Number of requests per second. If this is inf, then all the requests are sent at time 0. " - "Otherwise, we use Poisson process to synthesize the request arrival times. Default is 128.0.", + "Otherwise, we use Poisson process to synthesize the request arrival times. 
Default is inf.", ) parser.add_argument("--seed", type=int, default=1, help="The random seed.") parser.add_argument( From d8627ed16d403751e7cecbdc0563f26230c6ea25 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 18 Aug 2024 23:01:55 -0700 Subject: [PATCH 051/118] [Docs] Add instruction for running on clouds and kubernetes with SkyPilot (#1144) Co-authored-by: Zongheng Yang --- README.md | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/README.md b/README.md index 5434bb25f6..9ac4fbb308 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,46 @@ docker run --gpus all \ 1. Copy the [compose.yml](./docker/compose.yaml) to your local machine 2. Execute the command `docker compose up -d` in your terminal. +### Method 5: Run on Kubernetes or Clouds with SkyPilot + +To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot). + +1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). +2. Deploy on your own infra with a single command and get the HTTP API endpoint: +
+<summary>SkyPilot YAML: sglang.yaml</summary>
+
+```yaml
+# sglang.yaml
+envs:
+  HF_TOKEN: null
+
+resources:
+  image_id: docker:lmsysorg/sglang:latest
+  accelerators: A100
+  ports: 30000
+
+run: |
+  conda deactivate
+  python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --host 0.0.0.0 \
+    --port 30000
+```
+
+</details>
+ +```bash +# Deploy on any cloud or Kubernetes cluster. Use --cloud to select a specific cloud provider. +HF_TOKEN= sky launch -c sglang --env HF_TOKEN sglang.yaml + +# Get the HTTP API endpoint +sky status --endpoint 30000 sglang +``` +3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve). + + + ### Common Notes - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue. - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`. From b997a18d74213e905052c47941eebefd36a4d276 Mon Sep 17 00:00:00 2001 From: yichuan~ <73766326+yichuan520030910320@users.noreply.github.com> Date: Sun, 18 Aug 2024 23:45:41 -0700 Subject: [PATCH 052/118] [Feat]Add support for optional start len of logprobs (#1035) Co-authored-by: Ying Sheng Co-authored-by: Yineng Zhang Co-authored-by: Lianmin Zheng Co-authored-by: Liangsheng Yin --- python/sglang/srt/layers/logits_processor.py | 61 +++++++++++++++---- python/sglang/srt/managers/io_struct.py | 4 +- .../sglang/srt/managers/tokenizer_manager.py | 7 +++ .../srt/model_executor/forward_batch_info.py | 18 +++++- python/sglang/srt/openai_api/adapter.py | 42 ++++++++++--- python/sglang/srt/server.py | 2 + python/sglang/test/runners.py | 1 + test/srt/test_openai_server.py | 7 +-- 8 files changed, 112 insertions(+), 30 deletions(-) diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py index 2e0ce6d5c7..a5ba06de02 100644 --- a/python/sglang/srt/layers/logits_processor.py +++ b/python/sglang/srt/layers/logits_processor.py @@ -55,6 +55,9 @@ class LogitsMetadata: extend_start_loc: Optional[torch.Tensor] = None top_logprobs_nums: Optional[List[int]] = None + extend_seq_lens_cpu: List[int] = None + logprob_start_lens_cpu: List[int] = None + @classmethod def from_input_metadata(cls, input_metadata: InputMetadata): return cls( @@ -63,6 +66,8 @@ def from_input_metadata(cls, input_metadata: InputMetadata): extend_start_loc=input_metadata.extend_start_loc, return_logprob=input_metadata.return_logprob, top_logprobs_nums=input_metadata.top_logprobs_nums, + extend_seq_lens_cpu=input_metadata.extend_seq_lens_cpu, + logprob_start_lens_cpu=input_metadata.logprob_start_lens_cpu, ) @@ -75,12 +80,16 @@ def __init__(self, config, skip_all_gather: bool = False): ) def _get_normalized_prompt_logprobs( - self, input_token_logprobs, logits_metadata: LogitsMetadata + self, + input_token_logprobs: torch.Tensor, + cum_start_len0: torch.Tensor, + cum_start_len1: torch.Tensor, + logits_metadata: LogitsMetadata, ): logprobs_cumsum = torch.cumsum(input_token_logprobs, dim=0, dtype=torch.float32) - start = logits_metadata.extend_start_loc.clone() - end = start + logits_metadata.extend_seq_lens - 2 + start = logits_metadata.extend_start_loc.clone() - cum_start_len0 + end = start + logits_metadata.extend_seq_lens - 2 - cum_start_len1 start.clamp_(min=0, 
max=input_token_logprobs.shape[0] - 1) end.clamp_(min=0, max=input_token_logprobs.shape[0] - 1) sum_logp = ( @@ -93,7 +102,7 @@ def _get_normalized_prompt_logprobs( return normalized_prompt_logprobs @staticmethod - def get_top_logprobs(all_logprobs, logits_metadata: LogitsMetadata): + def get_top_logprobs(all_logprobs: torch.Tensor, logits_metadata: LogitsMetadata): if logits_metadata.forward_mode == ForwardMode.DECODE: output_top_logprobs = [] max_k = max(logits_metadata.top_logprobs_nums) @@ -107,7 +116,7 @@ def get_top_logprobs(all_logprobs, logits_metadata: LogitsMetadata): # TODO: vectorize the code below input_top_logprobs, output_top_logprobs = [], [] pt = 0 - extend_seq_lens_cpu = logits_metadata.extend_seq_lens.tolist() + extend_seq_lens_cpu = logits_metadata.extend_seq_lens_cpu max_k = max(logits_metadata.top_logprobs_nums) ret = all_logprobs.topk(max_k, dim=1) @@ -115,26 +124,30 @@ def get_top_logprobs(all_logprobs, logits_metadata: LogitsMetadata): indices = ret.indices.tolist() for i, extend_seq_len in enumerate(extend_seq_lens_cpu): + start_len = logits_metadata.logprob_start_lens_cpu[i] + pruned_len = extend_seq_len - start_len + if extend_seq_len == 0: input_top_logprobs.append([]) output_top_logprobs.append([]) continue + k = logits_metadata.top_logprobs_nums[i] input_top_logprobs.append( [ list(zip(values[pt + j][:k], indices[pt + j][:k])) - for j in range(extend_seq_len - 1) + for j in range(pruned_len - 1) ] ) output_top_logprobs.append( list( zip( - values[pt + extend_seq_len - 1][:k], - indices[pt + extend_seq_len - 1][:k], + values[pt + pruned_len - 1][:k], + indices[pt + pruned_len - 1][:k], ) ) ) - pt += extend_seq_len + pt += pruned_len return input_top_logprobs, output_top_logprobs @@ -205,7 +218,23 @@ def forward( output_top_logprobs=output_top_logprobs, ) else: - all_logits = torch.matmul(hidden_states, weight.T) + pt, states, pruned_input_ids = 0, [], [] + for i, extend_len in enumerate(logits_metadata.extend_seq_lens_cpu): + start_len = logits_metadata.logprob_start_lens_cpu[i] + states.append(hidden_states[pt + start_len : pt + extend_len]) + pruned_input_ids.append(input_ids[pt + start_len : pt + extend_len]) + pt += extend_len + + states = torch.cat(states, dim=0) + pruned_input_ids = torch.cat(pruned_input_ids, dim=0) + + cum_start_len1 = torch.tensor( + logits_metadata.logprob_start_lens_cpu, device="cuda" + ).cumsum(0) + cum_start_len0 = torch.zeros_like(cum_start_len1) + cum_start_len0[1:] = cum_start_len1[:-1] + + all_logits = torch.matmul(states, weight.T) if self.do_tensor_parallel_all_gather: all_logits = tensor_model_parallel_all_gather(all_logits) all_logits = all_logits[:, : self.config.vocab_size].float() @@ -230,19 +259,25 @@ def forward( else: input_top_logprobs = output_top_logprobs = None - last_logprobs = all_logprobs[last_index] + last_logprobs = all_logprobs[last_index - cum_start_len1] # Compute the logprobs and normalized logprobs for the prefill tokens. # Note that we pad a zero at the end of each sequence for easy computation. input_token_logprobs = all_logprobs[ torch.arange(all_logprobs.shape[0], device="cuda"), - torch.cat([input_ids[1:], torch.tensor([0], device="cuda")]), + torch.cat([pruned_input_ids[1:], torch.tensor([0], device="cuda")]), ] normalized_prompt_logprobs = self._get_normalized_prompt_logprobs( - input_token_logprobs, logits_metadata + input_token_logprobs, + cum_start_len0, + cum_start_len1, + logits_metadata, ) + # Remove the last token logprob for the prefill tokens. 
+ input_token_logprobs = input_token_logprobs[:-1] + return LogitProcessorOutput( next_token_logits=last_logits, next_token_logprobs=last_logprobs, diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 82f280b606..3a0ecd8f6c 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -75,7 +75,7 @@ def post_init(self): if self.return_logprob is None: self.return_logprob = False if self.logprob_start_len is None: - self.logprob_start_len = 0 + self.logprob_start_len = -1 if self.top_logprobs_num is None: self.top_logprobs_num = 0 else: @@ -141,7 +141,7 @@ def post_init(self): self.return_logprob = [self.return_logprob] * num if self.logprob_start_len is None: - self.logprob_start_len = [0] * num + self.logprob_start_len = [-1] * num elif not isinstance(self.logprob_start_len, list): self.logprob_start_len = [self.logprob_start_len] * num diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index d5fbfe05d3..edbfff3ec8 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -195,6 +195,9 @@ async def _handle_single_request( if not_use_index else obj.logprob_start_len[index] ) + if return_logprob and logprob_start_len == -1: + logprob_start_len = len(input_ids) - 1 + top_logprobs_num = ( obj.top_logprobs_num if not_use_index @@ -245,6 +248,8 @@ async def _handle_single_request( top_logprobs_num = obj.top_logprobs_num[0] if self.is_generation: + if return_logprob and logprob_start_len == -1: + logprob_start_len = len(input_ids) - 1 tokenized_obj = TokenizedGenerateReqInput( rid, input_text, @@ -334,6 +339,8 @@ async def _handle_batch_request( sampling_params = self._get_sampling_params(obj.sampling_params[index]) if self.is_generation: + if obj.return_logprob[index] and obj.logprob_start_len[index] == -1: + obj.logprob_start_len[index] = len(input_ids) - 1 pixel_values, image_hash, image_size = await self._get_pixel_values( obj.image_data[index] ) diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py index 3cf68eab24..bac0a05378 100644 --- a/python/sglang/srt/model_executor/forward_batch_info.py +++ b/python/sglang/srt/model_executor/forward_batch_info.py @@ -61,9 +61,11 @@ class InputMetadata: extend_start_loc: torch.Tensor = None extend_no_prefix: bool = None - # Output options + # For logprob return_logprob: bool = False top_logprobs_nums: List[int] = None + extend_seq_lens_cpu: List[int] = None + logprob_start_lens_cpu: List[int] = None # For multimodal pixel_values: List[torch.Tensor] = None @@ -139,6 +141,7 @@ def compute_positions(self, batch: ScheduleBatch): def compute_extend_infos(self, batch: ScheduleBatch): if self.forward_mode == ForwardMode.DECODE: self.extend_seq_lens = self.extend_start_loc = self.extend_no_prefix = None + self.extend_seq_lens_cpu = self.logprob_start_lens_cpu = None else: extend_lens_cpu = [ len(r.fill_ids) - batch.prefix_lens_cpu[i] @@ -149,6 +152,19 @@ def compute_extend_infos(self, batch: ScheduleBatch): self.extend_start_loc[1:] = torch.cumsum(self.extend_seq_lens[:-1], dim=0) self.extend_no_prefix = all(l == 0 for l in batch.prefix_lens_cpu) + self.extend_seq_lens_cpu = extend_lens_cpu + self.logprob_start_lens_cpu = [ + ( + min( + req.logprob_start_len - batch.prefix_lens_cpu[i], + extend_lens_cpu[i] - 1, + ) + if req.logprob_start_len >= batch.prefix_lens_cpu[i] + else 
extend_lens_cpu[i] - 1 # Fake extend, actually decode + ) + for i, req in enumerate(batch.reqs) + ] + @classmethod def from_schedule_batch( cls, diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 15aa701cb0..5d7bb7af7d 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -20,6 +20,7 @@ import os import time import uuid +import warnings from http import HTTPStatus from typing import Dict, List, Optional @@ -383,20 +384,33 @@ def iter_file(): return StreamingResponse(iter_file(), media_type="application/octet-stream") -def v1_generate_request(all_requests): +def v1_generate_request(all_requests: List[CompletionRequest]): prompts = [] sampling_params_list = [] return_logprobs = [] + logprob_start_lens = [] top_logprobs_nums = [] - first_prompt_type = type(all_requests[0].prompt) + # NOTE: with openai API, the prompt's logprobs are always not computed + first_prompt_type = type(all_requests[0].prompt) for request in all_requests: - prompt = request.prompt assert ( - type(prompt) == first_prompt_type + type(request.prompt) == first_prompt_type ), "All prompts must be of the same type in file input settings" - prompts.append(prompt) + if len(all_requests) > 1 and request.n > 1: + raise ValueError( + "Parallel sampling is not supported for completions from files" + ) + if request.echo and request.logprobs: + warnings.warn( + "Echo is not compatible with logprobs. " + "To compute logprobs of input prompt, please use SGLang /request API." + ) + + for request in all_requests: + prompts.append(request.prompt) return_logprobs.append(request.logprobs is not None and request.logprobs > 0) + logprob_start_lens.append(-1) top_logprobs_nums.append( request.logprobs if request.logprobs is not None else 0 ) @@ -416,14 +430,11 @@ def v1_generate_request(all_requests): "ignore_eos": request.ignore_eos, } ) - if len(all_requests) > 1 and request.n > 1: - raise ValueError( - "Parallel sampling is not supported for completions from files" - ) if len(all_requests) == 1: prompt = prompts[0] sampling_params_list = sampling_params_list[0] + logprob_start_lens = logprob_start_lens[0] return_logprobs = return_logprobs[0] top_logprobs_nums = top_logprobs_nums[0] if isinstance(prompt, str) or isinstance(prompt[0], str): @@ -441,6 +452,7 @@ def v1_generate_request(all_requests): sampling_params=sampling_params_list, return_logprob=return_logprobs, top_logprobs_num=top_logprobs_nums, + logprob_start_len=logprob_start_lens, return_text_in_logprobs=True, stream=all_requests[0].stream, ) @@ -694,12 +706,18 @@ async def generate_stream_resp(): return response -def v1_chat_generate_request(all_requests, tokenizer_manager): +def v1_chat_generate_request( + all_requests: List[ChatCompletionRequest], tokenizer_manager +): input_ids = [] sampling_params_list = [] image_data_list = [] return_logprobs = [] + logprob_start_lens = [] top_logprobs_nums = [] + + # NOTE: with openai API, the prompt's logprobs are always not computed + for request in all_requests: # Prep the data needed for the underlying GenerateReqInput: # - prompt: The full prompt string. 
@@ -732,6 +750,7 @@ def v1_chat_generate_request(all_requests, tokenizer_manager): image_data = None input_ids.append(prompt_ids) return_logprobs.append(request.logprobs) + logprob_start_lens.append(-1) top_logprobs_nums.append(request.top_logprobs) sampling_params_list.append( { @@ -758,17 +777,20 @@ def v1_chat_generate_request(all_requests, tokenizer_manager): sampling_params_list = sampling_params_list[0] image_data = image_data_list[0] return_logprobs = return_logprobs[0] + logprob_start_lens = logprob_start_lens[0] top_logprobs_nums = top_logprobs_nums[0] else: if isinstance(input_ids[0], str): prompt_kwargs = {"text": input_ids} else: prompt_kwargs = {"input_ids": input_ids} + adapted_request = GenerateReqInput( **prompt_kwargs, image_data=image_data, sampling_params=sampling_params_list, return_logprob=return_logprobs, + logprob_start_len=logprob_start_lens, top_logprobs_num=top_logprobs_nums, stream=all_requests[0].stream, return_text_in_logprobs=True, diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 9028c12309..997b805cc8 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -559,12 +559,14 @@ def generate( prompt: str, sampling_params: Optional[Dict] = None, return_logprob: Optional[Union[List[bool], bool]] = False, + logprob_start_len: Optional[Union[List[int], int]] = None, top_logprobs_num: Optional[Union[List[int], int]] = None, ): json_data = { "text": prompt, "sampling_params": sampling_params, "return_logprob": return_logprob, + "logprob_start_len": logprob_start_len, "top_logprobs_num": top_logprobs_num, } response = requests.post( diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index e325ecb710..9386d7f7af 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -209,6 +209,7 @@ def forward( prompt, sampling_params=sampling_params, return_logprob=True, + logprob_start_len=0, top_logprobs_num=NUM_TOP_LOGPROBS, ) response = json.loads(response) diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index 8724247564..c62fefe9f0 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -70,13 +70,12 @@ def run_completion( assert isinstance(response.choices[0].logprobs.tokens[0], str) assert isinstance(response.choices[0].logprobs.top_logprobs[1], dict) ret_num_top_logprobs = len(response.choices[0].logprobs.top_logprobs[1]) + # FIXME: Sometimes, some top_logprobs are missing in the return value. 
The reason is that some out_put id maps to the same output token and duplicate in the map # assert ret_num_top_logprobs == logprobs, f"{ret_num_top_logprobs} vs {logprobs}" + assert ret_num_top_logprobs > 0 - if echo: - assert response.choices[0].logprobs.token_logprobs[0] == None - else: - assert response.choices[0].logprobs.token_logprobs[0] != None + assert response.choices[0].logprobs.token_logprobs[0] != None assert response.id assert response.created From df191254abc002b3284560d9c4b94214a4656265 Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Mon, 19 Aug 2024 18:23:07 +0800 Subject: [PATCH 053/118] Optimize MLA/GQA/MQA Triton decoding (#1138) Co-authored-by: Yineng Zhang --- python/sglang/srt/layers/decode_attention.py | 386 ++++++++++++++++--- 1 file changed, 337 insertions(+), 49 deletions(-) diff --git a/python/sglang/srt/layers/decode_attention.py b/python/sglang/srt/layers/decode_attention.py index c868299ef4..eef3c00096 100644 --- a/python/sglang/srt/layers/decode_attention.py +++ b/python/sglang/srt/layers/decode_attention.py @@ -58,7 +58,6 @@ def _fwd_kernel_stage1( att_stride_h, kv_group_num: tl.constexpr, BLOCK_DMODEL: tl.constexpr, - BLOCK_DPE: tl.constexpr, BLOCK_N: tl.constexpr, logit_cap: tl.constexpr, ): @@ -78,10 +77,6 @@ def _fwd_kernel_stage1( off_q = cur_batch * stride_qbs + cur_head * stride_qh + offs_d - if BLOCK_DPE > 0: - offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE) - off_qpe = cur_batch * stride_qbs + cur_head * stride_qh + offs_dpe - offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N) block_stard_index = start_n * BLOCK_N @@ -106,19 +101,6 @@ def _fwd_kernel_stage1( other=0.0, ).to(REDUCE_TRITON_TYPE) att_value = tl.sum(q[None, :] * k, 1) - if BLOCK_DPE > 0: - qpe = tl.load(Q + off_qpe + start_mark).to(REDUCE_TRITON_TYPE) - offs_buf_kpe = ( - k_loc[:, None] * stride_buf_kbs - + cur_kv_head * stride_buf_kh - + offs_dpe[None, :] - ) - kpe = tl.load( - K_Buffer + offs_buf_kpe, - mask=offs_n_new[:, None] < cur_batch_end_index, - other=0.0, - ).to(REDUCE_TRITON_TYPE) - att_value += tl.sum(qpe[None, :] * kpe, 1) att_value *= sm_scale if logit_cap > 0: @@ -214,14 +196,7 @@ def _decode_att_m_fwd( # shape constraints Lq, Lk = q.shape[-1], k_buffer.shape[-1] assert Lq == Lk - assert Lk in {16, 32, 64, 128, 256, 576} - - if Lk == 576: - BLOCK_DMODEL = 512 - BLOCK_DPE = 64 - else: - BLOCK_DMODEL = Lk - BLOCK_DPE = 0 + assert Lk in {16, 32, 64, 128, 256} batch, head_num = B_req_idx.shape[0], q.shape[1] @@ -249,8 +224,7 @@ def _decode_att_m_fwd( k_buffer.stride(1), att_out.stride(0), kv_group_num=kv_group_num, - BLOCK_DMODEL=BLOCK_DMODEL, - BLOCK_DPE=BLOCK_DPE, + BLOCK_DMODEL=Lk, BLOCK_N=BLOCK, logit_cap=logit_cap, num_warps=num_warps, @@ -296,6 +270,293 @@ def _decode_softmax_reducev_fwd( ) +@triton.jit +def _fwd_grouped_kernel_stage1( + Q, + K_Buffer, + sm_scale, + Req_to_tokens, + B_req_idx, + B_Start_Loc, + B_Seqlen, + Att_Out, + stride_req_to_tokens_b, + stride_qbs, + stride_qh, + stride_buf_kbs, + stride_buf_kh, + att_stride_h, + kv_group_num: tl.constexpr, + q_head_num: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_DPE: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_H: tl.constexpr, + logit_cap: tl.constexpr, +): + cur_batch = tl.program_id(0) + cur_kv_head = tl.program_id(1) + start_n = tl.program_id(2) + + cur_head = cur_kv_head * kv_group_num + tl.arange(0, BLOCK_H) + mask_h = cur_head < (cur_kv_head + 1) * kv_group_num + mask_h = mask_h & (cur_head < q_head_num) + + offs_d = tl.arange(0, BLOCK_DMODEL) + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + 
cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) + cur_batch_req_idx = tl.load(B_req_idx + cur_batch) + + cur_batch_start_index = 0 + cur_batch_end_index = cur_batch_seq_len + + offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[None, :] + + if BLOCK_DPE > 0: + offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE) + off_qpe = ( + cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_dpe[None, :] + ) + + offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N) + + block_stard_index = start_n * BLOCK_N + block_mask = tl.where(block_stard_index < cur_batch_seq_len, 1, 0) + + for start_mark in range(0, block_mask, 1): + q = tl.load(Q + offs_q + start_mark, mask=mask_h[:, None]).to( + REDUCE_TRITON_TYPE + ) + offs_n_new = cur_batch_start_index + offs_n + k_loc = tl.load( + Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + offs_n_new, + mask=offs_n_new < cur_batch_end_index, + other=0, + ) + offs_buf_k = ( + k_loc[None, :] * stride_buf_kbs + + cur_kv_head * stride_buf_kh + + offs_d[:, None] + ) + k = tl.load( + K_Buffer + offs_buf_k, + mask=offs_n_new[None, :] < cur_batch_end_index, + other=0.0, + ).to(REDUCE_TRITON_TYPE) + qk = tl.dot(q, k) + if BLOCK_DPE > 0: + qpe = tl.load(Q + off_qpe + start_mark, mask=mask_h[:, None]).to( + REDUCE_TRITON_TYPE + ) + offs_buf_kpe = ( + k_loc[None, :] * stride_buf_kbs + + cur_kv_head * stride_buf_kh + + offs_dpe[:, None] + ) + kpe = tl.load( + K_Buffer + offs_buf_kpe, + mask=offs_n_new[None, :] < cur_batch_end_index, + other=0.0, + ).to(REDUCE_TRITON_TYPE) + qk += tl.dot(qpe, kpe) + qk *= sm_scale + + if logit_cap > 0: + qk = logit_cap * tanh(qk / logit_cap) + + offs_o = cur_head[:, None] * att_stride_h + ( + cur_batch_in_all_start_index + offs_n[None, :] + ) + + tl.store( + Att_Out + offs_o, + qk, + mask=mask_h[:, None] & (offs_n_new[None, :] < cur_batch_end_index), + ) + + +@triton.jit +def _fwd_grouped_kernel_stage2( + Logics, + V_Buffer, + Out, + Req_to_tokens, + B_req_idx, + B_Start_Loc, + B_Seqlen, + stride_logic_h, + stride_buf_vbs, + stride_buf_vh, + stride_obs, + stride_oh, + stride_req_to_token_b, + kv_group_num: tl.constexpr, + q_head_num: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_H: tl.constexpr, +): + cur_batch = tl.program_id(0) + cur_kv_head = tl.program_id(1) + + cur_head = cur_kv_head * kv_group_num + tl.arange(0, BLOCK_H) + mask_h = cur_head < (cur_kv_head + 1) * kv_group_num + mask_h = mask_h & (cur_head < q_head_num) + + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_start_loc = tl.load(B_Start_Loc + cur_batch) + cur_batch_req_idx = tl.load(B_req_idx + cur_batch) + + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL) + + offs_buf_v = cur_kv_head * stride_buf_vh + offs_d[None, :] + v_ptrs = V_Buffer + offs_buf_v + + e_max = tl.zeros([BLOCK_H], dtype=tl.float32) - float("inf") + e_sum = tl.zeros([BLOCK_H], dtype=tl.float32) + acc = tl.zeros([BLOCK_H, BLOCK_DMODEL], dtype=tl.float32) + + for start_n in range(0, cur_batch_seq_len, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + v_index = tl.load( + Req_to_tokens + + cur_batch_req_idx * stride_req_to_token_b + + (start_n + offs_n), + mask=(start_n + offs_n) < cur_batch_seq_len, + other=0, + ) + + offs_qk = cur_head[:, None] * stride_logic_h + ( + cur_batch_start_loc + start_n + offs_n[None, :] + ) + + qk = tl.load( + Logics + offs_qk, + mask=mask_h[:, None] & (start_n + offs_n[None, :] < cur_batch_seq_len), + other=float("-inf"), + ) + + n_e_max = tl.maximum(tl.max(qk, 1), 
e_max) + old_scale = tl.exp(e_max - n_e_max) + p = tl.exp(qk - n_e_max[:, None]) + e_sum = e_sum * old_scale + tl.sum(p, 1) + v = tl.load(v_ptrs + v_index[:, None] * stride_buf_vbs) + p = p.to(v.dtype) + acc = acc * old_scale[:, None] + tl.dot(p, v) + e_max = n_e_max + + acc = acc / e_sum[:, None] + off_o = cur_batch * stride_obs + cur_head[:, None] * stride_oh + offs_d[None, :] + out_ptrs = Out + off_o + tl.store(out_ptrs, acc, mask=mask_h[:, None]) + + +def _decode_grouped_att_m_fwd( + q, + k_buffer, + att_out, + Req_to_tokens, + B_req_idx, + B_Start_Loc, + B_Seqlen, + max_len_in_batch, + sm_scale, + logit_cap, +): + BLOCK = 32 + # shape constraints + Lq, Lk = q.shape[-1], k_buffer.shape[-1] + assert Lq == Lk + assert Lk in {16, 32, 64, 128, 256, 576} + + if Lk == 576: + BLOCK_DMODEL = 512 + BLOCK_DPE = 64 + else: + BLOCK_DMODEL = Lk + BLOCK_DPE = 0 + + batch, head_num = B_req_idx.shape[0], q.shape[1] + kv_group_num = q.shape[1] // k_buffer.shape[1] + + BLOCK_H = max(16, triton.next_power_of_2(kv_group_num)) + grid = ( + batch, + triton.cdiv(head_num, min(BLOCK_H, kv_group_num)), + triton.cdiv(max_len_in_batch, BLOCK), + ) + + num_warps = 4 + + _fwd_grouped_kernel_stage1[grid]( + q, + k_buffer, + sm_scale, + Req_to_tokens, + B_req_idx, + B_Start_Loc, + B_Seqlen, + att_out, + Req_to_tokens.stride(0), + q.stride(0), + q.stride(1), + k_buffer.stride(0), + k_buffer.stride(1), + att_out.stride(0), + kv_group_num=kv_group_num, + q_head_num=head_num, + BLOCK_DMODEL=BLOCK_DMODEL, + BLOCK_DPE=BLOCK_DPE, + BLOCK_N=BLOCK, + BLOCK_H=BLOCK_H, + logit_cap=logit_cap, + num_warps=num_warps, + num_stages=1, + ) + + +def _decode_grouped_softmax_reducev_fwd( + logics, + v_buffer, + o, + req_to_tokens, + b_req_idx, + b_start_loc, + b_seq_len, +): + BLOCK = 128 + batch, head_num = b_seq_len.shape[0], logics.shape[0] + kv_group_num = logics.shape[0] // v_buffer.shape[1] + BLOCK_H = max(16, triton.next_power_of_2(kv_group_num)) + grid = (batch, triton.cdiv(head_num, min(BLOCK_H, kv_group_num)), 1) + + num_warps = 8 + + _fwd_grouped_kernel_stage2[grid]( + logics, + v_buffer, + o, + req_to_tokens, + b_req_idx, + b_start_loc, + b_seq_len, + logics.stride(0), + v_buffer.stride(0), + v_buffer.stride(1), + o.stride(0), + o.stride(1), + req_to_tokens.stride(0), + kv_group_num=kv_group_num, + q_head_num=head_num, + BLOCK_DMODEL=v_buffer.shape[-1], + BLOCK_N=BLOCK, + BLOCK_H=BLOCK_H, + num_warps=num_warps, + num_stages=1, + ) + + def decode_attention_fwd( q, k_buffer, @@ -316,24 +577,51 @@ def decode_attention_fwd( (q.shape[-2], total_num_tokens), dtype=REDUCE_TORCH_TYPE, device="cuda" ) - _decode_att_m_fwd( - q, - k_buffer, - att_m, - req_to_token, - b_req_idx, - b_start_loc, - b_seq_len, - max_len_in_batch, - sm_scale, - logit_cap, - ) - _decode_softmax_reducev_fwd( - att_m, - v_buffer, - o, - req_to_token, - b_req_idx, - b_start_loc, - b_seq_len, - ) + kv_group_num = q.shape[1] // v_buffer.shape[1] + + if kv_group_num == 1: + # MHA + _decode_att_m_fwd( + q, + k_buffer, + att_m, + req_to_token, + b_req_idx, + b_start_loc, + b_seq_len, + max_len_in_batch, + sm_scale, + logit_cap, + ) + _decode_softmax_reducev_fwd( + att_m, + v_buffer, + o, + req_to_token, + b_req_idx, + b_start_loc, + b_seq_len, + ) + else: + # GQA/MQA/MLA + _decode_grouped_att_m_fwd( + q, + k_buffer, + att_m, + req_to_token, + b_req_idx, + b_start_loc, + b_seq_len, + max_len_in_batch, + sm_scale, + logit_cap, + ) + _decode_grouped_softmax_reducev_fwd( + att_m, + v_buffer, + o, + req_to_token, + b_req_idx, + b_start_loc, + b_seq_len, + ) From 
d8476818efc88188d0aa0a8a176024a0b82e7a1d Mon Sep 17 00:00:00 2001 From: Juwan Yoo Date: Tue, 20 Aug 2024 08:06:55 -0700 Subject: [PATCH 054/118] feat: allow streaming for multi-prompt and/or parallel sampling (#1134) --- .../sglang/srt/managers/tokenizer_manager.py | 93 ++++++++------- python/sglang/srt/openai_api/adapter.py | 106 +++++++++++++----- test/srt/test_openai_server.py | 75 ++++++++++--- test/srt/test_srt_endpoint.py | 21 +++- 4 files changed, 210 insertions(+), 85 deletions(-) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index edbfff3ec8..e157217e34 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -153,9 +153,6 @@ async def generate_request( async for response in self._handle_single_request(obj, request): yield response else: - if hasattr(obj, "stream") and obj.stream: - raise ValueError("Do not support stream for batch mode.") - async for response in self._handle_batch_request(obj, request): yield response @@ -311,6 +308,7 @@ async def _handle_batch_request( parallel_sample_num = 1 # First send out all requests + generators = [] for i in range(batch_size): for j in range(parallel_sample_num): if j == 0 and parallel_sample_num != 1: @@ -371,42 +369,48 @@ async def _handle_batch_request( state = ReqState([], False, event) self.rid_to_state[rid] = state - # Then wait for all responses + generators.append( + self._wait_for_response( + event, + state, + obj, + rid, + request, + index=index, + response_index=len(generators), + ) + ) + + # Then process the responses based on streaming option + + is_stream = hasattr(obj, "stream") and obj.stream + + tasks = [asyncio.create_task(gen.__anext__()) for gen in generators] output_list = [] - for i in range(batch_size): - for j in range(parallel_sample_num): - if j == 0 and parallel_sample_num != 1: - continue - index = i * parallel_sample_num + j - if parallel_sample_num != 1: - index += batch_size - 1 - i - rid = obj.rid[index] - state = self.rid_to_state[rid] - - while True: - try: - await asyncio.wait_for(state.event.wait(), timeout=4) - break - except asyncio.TimeoutError: - if request is not None and await request.is_disconnected(): - for rid in obj.rid: - self.abort_request(rid) - raise ValueError(f"Abort request {rid}") - continue - if self.is_generation: - output_list.append( - self.convert_logprob_style( - state.out_list[-1], - obj.return_logprob[index], - obj.top_logprobs_num[index], - obj.return_text_in_logprobs, - ) + + while tasks: + done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED) + + for task in done: + gen_index = tasks.index(task) + + try: + result = task.result() + + if is_stream: + yield result + else: + output_list.append(result) + + tasks[gen_index] = asyncio.create_task( + generators[gen_index].__anext__() ) - else: - output_list.append(state.out_list[-1]) - assert state.finished - del self.rid_to_state[rid] - yield output_list + except StopAsyncIteration: + del generators[gen_index] + del tasks[gen_index] + + if not is_stream: + yield output_list def _validate_input_length(self, input_ids: List[int]): if len(input_ids) >= self.context_len: @@ -437,26 +441,35 @@ async def _wait_for_response( obj: Union[GenerateReqInput, EmbeddingReqInput], rid: str, request, + index: int = None, + response_index: int = 0, ): while True: try: await asyncio.wait_for(event.wait(), timeout=4) except asyncio.TimeoutError: if request is not None and await request.is_disconnected(): - 
self.abort_request(rid) + for rid in [obj.rid] if obj.is_single else obj.rid: + self.abort_request(rid) raise ValueError(f"Abort request {rid}") continue if self.is_generation: out = self.convert_logprob_style( state.out_list[-1], - obj.return_logprob, - obj.top_logprobs_num, + obj.return_logprob if index is None else obj.return_logprob[index], + ( + obj.top_logprobs_num + if index is None + else obj.top_logprobs_num[index] + ), obj.return_text_in_logprobs, ) else: # isinstance(obj, EmbeddingReqInput) out = state.out_list[-1] + out["index"] = response_index + # Log requests if self.server_args.log_requests and state.finished: if obj.text is None: diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 5d7bb7af7d..12b40d6c40 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -277,6 +277,12 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe request_data = json.loads(line) file_request_list.append(request_data) body = request_data["body"] + + # Although streaming is supported for standalone completions, it is not supported in + # batch mode (multiple completions in single request). + if body.get("stream", False): + raise ValueError("Streaming requests are not supported in batch mode") + if end_point == "/v1/chat/completions": all_requests.append(ChatCompletionRequest(**body)) elif end_point == "/v1/completions": @@ -592,27 +598,45 @@ async def v1_completions(tokenizer_manager, raw_request: Request): if adapted_request.stream: async def generate_stream_resp(): - stream_buffer = "" - n_prev_token = 0 + stream_buffers = {} + n_prev_tokens = {} + prompt_tokens = {} + completion_tokens = {} try: async for content in tokenizer_manager.generate_request( adapted_request, raw_request ): + index = content["index"] + + stream_buffer = stream_buffers.get(index, "") + n_prev_token = n_prev_tokens.get(index, 0) + text = content["text"] - prompt_tokens = content["meta_info"]["prompt_tokens"] - completion_tokens = content["meta_info"]["completion_tokens"] + prompt_tokens[index] = content["meta_info"]["prompt_tokens"] + completion_tokens[index] = content["meta_info"]["completion_tokens"] if not stream_buffer: # The first chunk if request.echo: if isinstance(request.prompt, str): # for the case of single str prompts prompts = request.prompt - elif isinstance(request.prompt, list) and isinstance( - request.prompt[0], int - ): - prompts = tokenizer_manager.tokenizer.decode( - request.prompt, skip_special_tokens=True - ) + elif isinstance(request.prompt, list): + if isinstance(request.prompt[0], str): + # for the case of multiple str prompts + prompts = request.prompt[index // request.n] + elif isinstance(request.prompt[0], int): + # for the case of single token ids prompt + prompts = tokenizer_manager.tokenizer.decode( + request.prompt, skip_special_tokens=True + ) + elif isinstance(request.prompt[0], list) and isinstance( + request.prompt[0][0], int + ): + # for the case of multiple token ids prompts + prompts = tokenizer_manager.tokenizer.decode( + request.prompt[index // request.n], + skip_special_tokens=True, + ) # Prepend prompt in response text. 
text = prompts + text @@ -649,7 +673,7 @@ async def generate_stream_resp(): delta = text[len(stream_buffer) :] stream_buffer = stream_buffer + delta choice_data = CompletionResponseStreamChoice( - index=0, + index=index, text=delta, logprobs=logprobs, finish_reason=format_finish_reason( @@ -662,12 +686,24 @@ async def generate_stream_resp(): choices=[choice_data], model=request.model, ) + + stream_buffers[index] = stream_buffer + n_prev_tokens[index] = n_prev_token + yield f"data: {chunk.model_dump_json()}\n\n" if request.stream_options and request.stream_options.include_usage: + total_prompt_tokens = sum( + tokens + for i, tokens in prompt_tokens.items() + if i % request.n == 0 + ) + total_completion_tokens = sum( + tokens for tokens in completion_tokens.values() + ) usage = UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, + prompt_tokens=total_prompt_tokens, + completion_tokens=total_completion_tokens, + total_tokens=total_prompt_tokens + total_completion_tokens, ) final_usage_chunk = CompletionStreamResponse( @@ -914,16 +950,23 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request): if adapted_request.stream: async def generate_stream_resp(): - is_first = True - - stream_buffer = "" - n_prev_token = 0 + is_firsts = {} + stream_buffers = {} + n_prev_tokens = {} + prompt_tokens = {} + completion_tokens = {} try: async for content in tokenizer_manager.generate_request( adapted_request, raw_request ): - prompt_tokens = content["meta_info"]["prompt_tokens"] - completion_tokens = content["meta_info"]["completion_tokens"] + index = content["index"] + + is_first = is_firsts.get(index, True) + stream_buffer = stream_buffers.get(index, "") + n_prev_token = n_prev_tokens.get(index, 0) + + prompt_tokens[index] = content["meta_info"]["prompt_tokens"] + completion_tokens[index] = content["meta_info"]["completion_tokens"] if request.logprobs: logprobs = to_openai_style_logprobs( output_token_logprobs=content["meta_info"][ @@ -973,7 +1016,7 @@ async def generate_stream_resp(): # First chunk with role is_first = False choice_data = ChatCompletionResponseStreamChoice( - index=0, + index=index, delta=DeltaMessage(role="assistant"), finish_reason=format_finish_reason( content["meta_info"]["finish_reason"] @@ -991,7 +1034,7 @@ async def generate_stream_resp(): delta = text[len(stream_buffer) :] stream_buffer = stream_buffer + delta choice_data = ChatCompletionResponseStreamChoice( - index=0, + index=index, delta=DeltaMessage(content=delta), finish_reason=format_finish_reason( content["meta_info"]["finish_reason"] @@ -1003,12 +1046,25 @@ async def generate_stream_resp(): choices=[choice_data], model=request.model, ) + + is_firsts[index] = is_first + stream_buffers[index] = stream_buffer + n_prev_tokens[index] = n_prev_token + yield f"data: {chunk.model_dump_json()}\n\n" if request.stream_options and request.stream_options.include_usage: + total_prompt_tokens = sum( + tokens + for i, tokens in prompt_tokens.items() + if i % request.n == 0 + ) + total_completion_tokens = sum( + tokens for tokens in completion_tokens.values() + ) usage = UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, + prompt_tokens=total_prompt_tokens, + completion_tokens=total_completion_tokens, + total_tokens=total_prompt_tokens + total_completion_tokens, ) final_usage_chunk = ChatCompletionStreamResponse( diff --git a/test/srt/test_openai_server.py 
b/test/srt/test_openai_server.py index c62fefe9f0..828f5ab532 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -85,13 +85,26 @@ def run_completion( assert response.usage.completion_tokens > 0 assert response.usage.total_tokens > 0 - def run_completion_stream(self, echo, logprobs, token_input): + def run_completion_stream( + self, echo, logprobs, use_list_input, parallel_sample_num, token_input + ): client = openai.Client(api_key=self.api_key, base_url=self.base_url) prompt = "The capital of France is" if token_input: - prompt_arg = self.tokenizer.encode(prompt) + prompt_input = self.tokenizer.encode(prompt) + num_prompt_tokens = len(prompt_input) else: - prompt_arg = prompt + prompt_input = prompt + num_prompt_tokens = len(self.tokenizer.encode(prompt)) + + if use_list_input: + prompt_arg = [prompt_input, prompt_input] + num_choices = len(prompt_arg) + num_prompt_tokens *= 2 + else: + prompt_arg = prompt_input + num_choices = 1 + generator = client.completions.create( model=self.model, prompt=prompt_arg, @@ -101,9 +114,10 @@ def run_completion_stream(self, echo, logprobs, token_input): logprobs=logprobs, stream=True, stream_options={"include_usage": True}, + n=parallel_sample_num, ) - first = True + is_firsts = {} for response in generator: usage = response.usage if usage is not None: @@ -111,10 +125,14 @@ def run_completion_stream(self, echo, logprobs, token_input): assert usage.completion_tokens > 0 assert usage.total_tokens > 0 continue + + index = response.choices[0].index + is_first = is_firsts.get(index, True) + if logprobs: assert response.choices[0].logprobs assert isinstance(response.choices[0].logprobs.tokens[0], str) - if not (first and echo): + if not (is_first and echo): assert isinstance( response.choices[0].logprobs.top_logprobs[0], dict ) @@ -125,15 +143,20 @@ def run_completion_stream(self, echo, logprobs, token_input): # assert ret_num_top_logprobs == logprobs, f"{ret_num_top_logprobs} vs {logprobs}" assert ret_num_top_logprobs > 0 - if first: + if is_first: if echo: assert response.choices[0].text.startswith( prompt - ), f"{response.choices[0].text} and all args {echo} {logprobs} {token_input} {first}" - first = False + ), f"{response.choices[0].text} and all args {echo} {logprobs} {token_input} {is_first}" + is_firsts[index] = False assert response.id assert response.created + for index in [i for i in range(parallel_sample_num * num_choices)]: + assert not is_firsts.get( + index, True + ), f"index {index} is not found in the response" + def run_chat_completion(self, logprobs, parallel_sample_num): client = openai.Client(api_key=self.api_key, base_url=self.base_url) response = client.chat.completions.create( @@ -172,7 +195,7 @@ def run_chat_completion(self, logprobs, parallel_sample_num): assert response.usage.completion_tokens > 0 assert response.usage.total_tokens > 0 - def run_chat_completion_stream(self, logprobs): + def run_chat_completion_stream(self, logprobs, parallel_sample_num=1): client = openai.Client(api_key=self.api_key, base_url=self.base_url) generator = client.chat.completions.create( model=self.model, @@ -185,9 +208,10 @@ def run_chat_completion_stream(self, logprobs): top_logprobs=logprobs, stream=True, stream_options={"include_usage": True}, + n=parallel_sample_num, ) - is_first = True + is_firsts = {} for response in generator: usage = response.usage if usage is not None: @@ -196,11 +220,12 @@ def run_chat_completion_stream(self, logprobs): assert usage.total_tokens > 0 continue + index = response.choices[0].index 
data = response.choices[0].delta - if is_first: - data.role == "assistant" - is_first = False + if is_firsts.get(index, True): + assert data.role == "assistant" + is_firsts[index] = False continue if logprobs: @@ -222,6 +247,11 @@ def run_chat_completion_stream(self, logprobs): assert response.id assert response.created + for index in [i for i in range(parallel_sample_num)]: + assert not is_firsts.get( + index, True + ), f"index {index} is not found in the response" + def run_batch(self, mode): client = openai.Client(api_key=self.api_key, base_url=self.base_url) if mode == "completion": @@ -320,7 +350,9 @@ def run_batch(self, mode): f"Batch job status: {batch_job.status}...trying again in 3 seconds..." ) batch_job = client.batches.retrieve(batch_job.id) - assert batch_job.status == "completed" + assert ( + batch_job.status == "completed" + ), f"Batch job status is not completed: {batch_job.status}" assert batch_job.request_counts.completed == len(content) assert batch_job.request_counts.failed == 0 assert batch_job.request_counts.total == len(content) @@ -353,8 +385,16 @@ def test_completion_stream(self): # parallel sampling adn list input are not supported in streaming mode for echo in [False, True]: for logprobs in [None, 5]: - for token_input in [False, True]: - self.run_completion_stream(echo, logprobs, token_input) + for use_list_input in [True, False]: + for parallel_sample_num in [1, 2]: + for token_input in [False, True]: + self.run_completion_stream( + echo, + logprobs, + use_list_input, + parallel_sample_num, + token_input, + ) def test_chat_completion(self): for logprobs in [None, 5]: @@ -363,7 +403,8 @@ def test_chat_completion(self): def test_chat_completion_stream(self): for logprobs in [None, 5]: - self.run_chat_completion_stream(logprobs) + for parallel_sample_num in [1, 2]: + self.run_chat_completion_stream(logprobs, parallel_sample_num) def test_batch(self): for mode in ["completion", "chat"]: diff --git a/test/srt/test_srt_endpoint.py b/test/srt/test_srt_endpoint.py index 5e6bcbf60a..60f4cd58a3 100644 --- a/test/srt/test_srt_endpoint.py +++ b/test/srt/test_srt_endpoint.py @@ -23,7 +23,12 @@ def tearDownClass(cls): kill_child_process(cls.process.pid) def run_decode( - self, return_logprob=False, top_logprobs_num=0, return_text=False, n=1 + self, + return_logprob=False, + top_logprobs_num=0, + return_text=False, + n=1, + stream=False, ): response = requests.post( self.base_url + "/generate", @@ -34,14 +39,21 @@ def run_decode( "max_new_tokens": 32, "n": n, }, - "stream": False, + "stream": stream, "return_logprob": return_logprob, "top_logprobs_num": top_logprobs_num, "return_text_in_logprobs": return_text, "logprob_start_len": 0, }, ) - print(json.dumps(response.json())) + if not stream: + response_json = response.json() + else: + response_json = [] + for line in response.iter_lines(): + if line.startswith(b"data: ") and line[6:] != b"[DONE]": + response_json.append(json.loads(line[6:])) + print(json.dumps(response_json)) print("=" * 100) def test_simple_decode(self): @@ -50,6 +62,9 @@ def test_simple_decode(self): def test_parallel_sample(self): self.run_decode(n=3) + def test_parallel_sample_stream(self): + self.run_decode(n=3, stream=True) + def test_logprob(self): for top_logprobs_num in [0, 3]: for return_text in [True, False]: From a8ae640328f469b5cd9f1d1c21712c10fd0c5869 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 20 Aug 2024 08:31:29 -0700 Subject: [PATCH 055/118] Improve docs and warnings (#1164) --- .github/pull_request_template.md | 15 
+++++++-------- README.md | 7 ++++--- python/sglang/srt/hf_transformers_utils.py | 13 ++++++------- python/sglang/srt/managers/schedule_batch.py | 4 ++-- python/sglang/srt/managers/tp_worker.py | 2 +- python/sglang/srt/model_executor/model_runner.py | 2 +- python/sglang/srt/openai_api/adapter.py | 6 ++++-- 7 files changed, 25 insertions(+), 24 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 0926cfbe9c..21f9a21117 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,16 +1,15 @@ - + ## Motivation - + -## Modification +## Modifications - + ## Checklist -- [ ] Before submitting a PR for review, make sure it has passed verification in your local development environment **at least**. -- [ ] Ensure pre-commit `pre-commit run --all-files` or other linting tools are used to fix potential lint issues. -- [ ] Confirm that modifications are covered by complete unit tests. If not, please add more unit tests for correctness. -- [ ] Modify documentation as needed, such as docstrings or example tutorials. +- [ ] Format your code according to the [Contributor Guide](https://github.com/sgl-project/sglang/blob/main/docs/en/contributor_guide.md). +- [ ] Add unit tests as outlined in the [Contributor Guide](https://github.com/sgl-project/sglang/blob/main/docs/en/contributor_guide.md). +- [ ] Update documentation as needed, including docstrings or example tutorials. \ No newline at end of file diff --git a/README.md b/README.md index 9ac4fbb308..10b4f95fff 100644 --- a/README.md +++ b/README.md @@ -81,14 +81,17 @@ docker run --gpus all \ ### Method 4: Using docker compose +
> This method is recommended if you plan to serve it as a service. > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml). 1. Copy the [compose.yml](./docker/compose.yaml) to your local machine 2. Execute the command `docker compose up -d` in your terminal. +
### Method 5: Run on Kubernetes or Clouds with SkyPilot +
To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot). 1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). @@ -114,8 +117,6 @@ run: | --port 30000 ``` -
- ```bash # Deploy on any cloud or Kubernetes cluster. Use --cloud to select a specific cloud provider. HF_TOKEN= sky launch -c sglang --env HF_TOKEN sglang.yaml @@ -124,7 +125,7 @@ HF_TOKEN= sky launch -c sglang --env HF_TOKEN sglang.yaml sky status --endpoint 30000 sglang ``` 3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve). - + ### Common Notes diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py index fb198fd73c..b3576b47b7 100644 --- a/python/sglang/srt/hf_transformers_utils.py +++ b/python/sglang/srt/hf_transformers_utils.py @@ -147,13 +147,12 @@ def get_tokenizer( and kwargs.get("use_fast", True) and tokenizer_name != _FAST_LLAMA_TOKENIZER ): - pass - # warnings.warn( - # "For some LLaMA V1 models, initializing the fast tokenizer may " - # "take a long time. To reduce the initialization time, consider " - # f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original " - # "tokenizer." - # ) + warnings.warn( + "For some LLaMA V1 models, initializing the fast tokenizer may " + "take a long time. To reduce the initialization time, consider " + f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original " + "tokenizer." + ) try: tokenizer = AutoTokenizer.from_pretrained( tokenizer_name, diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 42c291bb17..14374e5806 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -270,7 +270,7 @@ def jump_forward_and_retokenize(self, jump_forward_str, next_state): if all_ids[prompt_tokens - 1] != self.origin_input_ids_unpadded[-1]: # TODO(lsyin): fix token fusion - warnings.warn( + logging.warning( "Token fusion between input and output, try to avoid this by removing the space at the end of the input." ) return False @@ -791,7 +791,7 @@ def sample(self, logits: torch.Tensor, is_multi_node_tp=False): ) if not torch.all(success): - warnings.warn("Sampling failed, fallback to top_k=1 strategy") + logging.warning("Sampling failed, fallback to top_k=1 strategy") probs = probs.masked_fill(torch.isnan(probs), 0.0) argmax_ids = torch.argmax(probs, dim=-1) batch_next_token_ids = torch.where( diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index b6cfa68bd4..b8a4576f73 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -774,7 +774,7 @@ def flush_cache(self): torch.cuda.empty_cache() logger.info("Cache flushed successfully!") else: - warnings.warn( + logging.warning( f"Cache not flushed because there are pending requests. 
" f"#queue-req: {len(self.waiting_queue)}, " f"#running-req: {0 if self.running_batch is None else len(self.running_batch.reqs)}" diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index b74a19e60d..2406addc8e 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -237,7 +237,7 @@ def init_memory_pool( self.max_total_num_tokens = self.profile_max_num_token(total_gpu_memory) if max_total_tokens is not None: if max_total_tokens > self.max_total_num_tokens: - warnings.warn( + logging.warning( f"max_total_tokens={max_total_tokens} is larger than the profiled value " f"{self.max_total_num_tokens}. " f"Use the profiled value instead." diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 12b40d6c40..582457ae04 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -17,10 +17,10 @@ import asyncio import json +import logging import os import time import uuid -import warnings from http import HTTPStatus from typing import Dict, List, Optional @@ -65,6 +65,8 @@ UsageInfo, ) +logger = logging.getLogger(__name__) + chat_template_name = None @@ -408,7 +410,7 @@ def v1_generate_request(all_requests: List[CompletionRequest]): "Parallel sampling is not supported for completions from files" ) if request.echo and request.logprobs: - warnings.warn( + logger.warning( "Echo is not compatible with logprobs. " "To compute logprobs of input prompt, please use SGLang /request API." ) From ff2cfdb1a21867700c21cf903dcd720c55ad60fe Mon Sep 17 00:00:00 2001 From: Xu-Chen <956140954@qq.com> Date: Tue, 20 Aug 2024 23:44:12 +0800 Subject: [PATCH 056/118] [Feature] add disable-custom-all-reduce (#1148) Co-authored-by: chenxu02 Co-authored-by: Yineng Zhang --- python/sglang/srt/model_executor/model_runner.py | 2 ++ python/sglang/srt/server_args.py | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 2406addc8e..bf89c637d9 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -37,6 +37,7 @@ get_tp_group, init_distributed_environment, initialize_model_parallel, + set_custom_all_reduce, ) from vllm.distributed.parallel_state import in_the_same_node_as from vllm.model_executor.model_loader import get_model @@ -105,6 +106,7 @@ def __init__( nccl_init_method = f"tcp://{server_args.nccl_init_addr}" else: nccl_init_method = f"tcp://127.0.0.1:{self.nccl_port}" + set_custom_all_reduce(not server_args.disable_custom_all_reduce) init_distributed_environment( backend="nccl", world_size=self.tp_size, diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 99ecff6a58..c7120564c1 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -86,6 +86,7 @@ class ServerArgs: enable_mla: bool = False attention_reduce_in_fp32: bool = False efficient_weight_load: bool = False + disable_custom_all_reduce: bool = False # Distributed args nccl_init_addr: Optional[str] = None @@ -428,6 +429,12 @@ def add_cli_args(parser: argparse.ArgumentParser): action="store_true", help="Turn on memory efficient weight loading with quantization (quantize per layer during loading).", ) + parser.add_argument( + "--disable-custom-all-reduce", + action="store_true", + default=False, + help="Disable the custom all-reduce 
kernel and fall back to NCCL.", + ) @classmethod def from_cli_args(cls, args: argparse.Namespace): From 04707b09b7240e19039c991ffc6981335c649caa Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Wed, 21 Aug 2024 02:14:51 +1000 Subject: [PATCH 057/118] misc: add hypervisor vendor (#1165) --- python/sglang/check_env.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/python/sglang/check_env.py b/python/sglang/check_env.py index cc8ba10e00..4db1f82fc0 100644 --- a/python/sglang/check_env.py +++ b/python/sglang/check_env.py @@ -170,6 +170,17 @@ def get_gpu_topology(): return None +def get_hypervisor_vendor(): + try: + output = subprocess.check_output(["lscpu"], text=True) + for line in output.split("\n"): + if "Hypervisor vendor:" in line: + return line.split(":")[1].strip() + return None + except: + return None + + def check_env(): """ Check and print environment information. @@ -184,6 +195,10 @@ def check_env(): if gpu_topo: env_info["NVIDIA Topology"] = gpu_topo + hypervisor_vendor = get_hypervisor_vendor() + if hypervisor_vendor: + env_info["Hypervisor vendor"] = hypervisor_vendor + ulimit_soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE) env_info["ulimit soft"] = ulimit_soft From 6242c399abb7582fb3d9a4e6a11f6af7d248841b Mon Sep 17 00:00:00 2001 From: Lucien Date: Wed, 21 Aug 2024 01:14:34 +0800 Subject: [PATCH 058/118] Generate 1 token to verify the health of the inference service in /health (#1154) Co-authored-by: Yineng Zhang --- python/sglang/srt/server.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 997b805cc8..55271c2352 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -89,6 +89,23 @@ tokenizer_manager = None +@app.get("/v1/health") +async def health(request: Request) -> Response: + """ + Generate 1 token to verify the health of the inference service. + """ + gri = GenerateReqInput( + text="s", sampling_params={"max_new_tokens": 1, "temperature": 0.7} + ) + try: + async for _ in tokenizer_manager.generate_request(gri, request): + break + return Response(status_code=200) + except Exception as e: + logger.exception(e) + return Response(status_code=503) + + @app.get("/health") async def health() -> Response: """Health check.""" From 350a81609b1e69194465a9dcbc7b8c1dd1a09e7c Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Wed, 21 Aug 2024 03:23:52 +1000 Subject: [PATCH 059/118] fix: resolve README render (#1166) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 10b4f95fff..a26f5dc5a9 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,7 @@ run: | --host 0.0.0.0 \ --port 30000 ``` + ```bash # Deploy on any cloud or Kubernetes cluster. Use --cloud to select a specific cloud provider. 
From cd10654e7ed99616d25fc1d6958ae74b21531bd6 Mon Sep 17 00:00:00 2001 From: Shan Yu Date: Tue, 20 Aug 2024 13:48:24 -0700 Subject: [PATCH 060/118] [Feat] Support update weights without restart server (#1157) --- .../srt/managers/detokenizer_manager.py | 5 + python/sglang/srt/managers/io_struct.py | 14 +++ .../sglang/srt/managers/tokenizer_manager.py | 45 +++++++- python/sglang/srt/managers/tp_worker.py | 17 +++ .../sglang/srt/model_executor/model_runner.py | 106 ++++++++++++++++-- python/sglang/srt/server.py | 23 +++- test/srt/test_update_weights.py | 106 ++++++++++++++++++ 7 files changed, 303 insertions(+), 13 deletions(-) create mode 100644 test/srt/test_update_weights.py diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py index 12511ac44e..e1402795fb 100644 --- a/python/sglang/srt/managers/detokenizer_manager.py +++ b/python/sglang/srt/managers/detokenizer_manager.py @@ -28,6 +28,7 @@ BatchEmbeddingOut, BatchStrOut, BatchTokenIDOut, + UpdateWeightReqOutput, ) from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR from sglang.srt.server_args import PortArgs, ServerArgs @@ -84,6 +85,10 @@ async def handle_loop(self): ) continue + if isinstance(recv_obj, UpdateWeightReqOutput): + self.send_to_tokenizer.send_pyobj(recv_obj) + continue + assert isinstance(recv_obj, BatchTokenIDOut) bs = len(recv_obj.rids) diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 3a0ecd8f6c..dc82245931 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -278,6 +278,20 @@ class FlushCacheReq: pass +@dataclass +class UpdateWeightReqInput: + # The model path with the new weights + model_path: str + # The format to load the weights + load_format: Optional[str] = None + + +@dataclass +class UpdateWeightReqOutput: + success: bool + message: str + + @dataclass class AbortReq: # The request id diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index e157217e34..ab375a39a9 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -46,6 +46,8 @@ GenerateReqInput, TokenizedEmbeddingReqInput, TokenizedGenerateReqInput, + UpdateWeightReqInput, + UpdateWeightReqOutput, ) from sglang.srt.mm_utils import expand2square, process_anyres_image from sglang.srt.sampling_params import SamplingParams @@ -121,6 +123,10 @@ def __init__( self.to_create_loop = True self.rid_to_state: Dict[str, ReqState] = {} + # for update model weights + self.model_update_lock = asyncio.Lock() + self.model_update_result = None + async def get_pixel_values(self, image_data): aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None) grid_pinpoints = ( @@ -146,6 +152,9 @@ async def generate_request( if self.to_create_loop: self.create_handle_loop() + while self.model_update_lock.locked(): + await asyncio.sleep(0) + obj.post_init() is_single = obj.is_single @@ -513,6 +522,30 @@ def flush_cache(self): req = FlushCacheReq() self.send_to_router.send_pyobj(req) + async def update_weights(self, obj: UpdateWeightReqInput, request): + if self.to_create_loop: + self.create_handle_loop() + + # default the load format to the server_args + if obj.load_format is None: + obj.load_format = self.server_args.load_format + + if not self.model_update_lock.locked(): + async with self.model_update_lock: + # wait for the previous generation requests to finish + while 
len(self.rid_to_state) > 0: + await asyncio.sleep(0) + self.send_to_router.send_pyobj(obj) + self.model_update_result = asyncio.Future() + result = await self.model_update_result + if result.success: + self.server_args.model_path = obj.model_path + self.server_args.load_format = obj.load_format + self.model_path = obj.model_path + return result.success, result.message + else: + return False, "Another update is in progress. Please try again later." + def abort_request(self, rid: str): if rid not in self.rid_to_state: return @@ -541,12 +574,18 @@ def create_handle_loop(self): async def handle_loop(self): while True: - recv_obj: Union[BatchStrOut, BatchEmbeddingOut, BatchTokenIDOut] = ( - await self.recv_from_detokenizer.recv_pyobj() - ) + recv_obj: Union[ + BatchStrOut, BatchEmbeddingOut, BatchTokenIDOut, UpdateWeightReqOutput + ] = await self.recv_from_detokenizer.recv_pyobj() + + if isinstance(recv_obj, UpdateWeightReqOutput): + self.model_update_result.set_result(recv_obj) + continue + assert isinstance( recv_obj, (BatchStrOut, BatchEmbeddingOut, BatchTokenIDOut) ), f"Unexpected obj received: {type(recv_obj)}" + for i, rid in enumerate(recv_obj.rids): state = self.rid_to_state.get(rid, None) if state is None: diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index b8a4576f73..7bd2e38129 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -39,6 +39,8 @@ FlushCacheReq, TokenizedEmbeddingReqInput, TokenizedGenerateReqInput, + UpdateWeightReqInput, + UpdateWeightReqOutput, ) from sglang.srt.managers.policy_scheduler import PolicyScheduler, PrefillAdder from sglang.srt.managers.schedule_batch import ( @@ -214,6 +216,9 @@ def exposed_step(self, recv_reqs: List): self.flush_cache() elif isinstance(recv_req, AbortReq): self.abort_request(recv_req) + elif isinstance(recv_req, UpdateWeightReqInput): + success, message = self.update_weights(recv_req) + self.out_pyobjs.append(UpdateWeightReqOutput(success, message)) else: raise ValueError(f"Invalid request: {recv_req}") @@ -773,12 +778,15 @@ def flush_cache(self): self.token_to_kv_pool.clear() torch.cuda.empty_cache() logger.info("Cache flushed successfully!") + if_success = True else: logging.warning( f"Cache not flushed because there are pending requests. 
" f"#queue-req: {len(self.waiting_queue)}, " f"#running-req: {0 if self.running_batch is None else len(self.running_batch.reqs)}" ) + if_success = False + return if_success def abort_request(self, recv_req): # Delete requests in the waiting queue @@ -798,6 +806,15 @@ def abort_request(self, recv_req): req.finished_reason = FINISH_ABORT() break + def update_weights(self, recv_req): + success, message = self.model_runner.update_weights( + recv_req.model_path, recv_req.load_format + ) + if success: + flash_cache_success = self.flush_cache() + assert flash_cache_success, "Cache flush failed after updating weights" + return success, message + def run_tp_server( gpu_id: int, diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index bf89c637d9..4a3396cf2c 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -15,6 +15,7 @@ """ModelRunner runs the forward passes of the models.""" +import gc import importlib import importlib.resources import logging @@ -157,9 +158,9 @@ def load_model(self): self.server_args.dtype = "float16" monkey_patch_vllm_dummy_weight_loader() - device_config = DeviceConfig() - load_config = LoadConfig(load_format=self.server_args.load_format) - vllm_model_config = VllmModelConfig( + self.device_config = DeviceConfig() + self.load_config = LoadConfig(load_format=self.server_args.load_format) + self.vllm_model_config = VllmModelConfig( model=self.server_args.model_path, quantization=self.server_args.quantization, tokenizer=None, @@ -173,17 +174,19 @@ def load_model(self): if is_llama3_405b_fp8_head_16(self.model_config) and self.tp_size <= 8: # A temporary hack to fix the num_heads for meta-llama/Meta-Llama-3.1-405B-FP8 checkpoints self.model_config.hf_config.num_key_value_heads = 8 - vllm_model_config.hf_config.num_key_value_heads = 8 + self.vllm_model_config.hf_config.num_key_value_heads = 8 monkey_patch_vllm_qvk_linear_loader() - self.dtype = vllm_model_config.dtype + self.dtype = self.vllm_model_config.dtype if self.model_config.model_overide_args is not None: - vllm_model_config.hf_config.update(self.model_config.model_overide_args) + self.vllm_model_config.hf_config.update( + self.model_config.model_overide_args + ) self.model = get_model( - model_config=vllm_model_config, - device_config=device_config, - load_config=load_config, + model_config=self.vllm_model_config, + device_config=self.device_config, + load_config=self.load_config, lora_config=None, multimodal_config=None, parallel_config=None, @@ -206,6 +209,91 @@ def load_model(self): f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB" ) + def update_weights(self, model_path, load_format): + from vllm.model_executor.model_loader.loader import ( + DefaultModelLoader, + device_loading_context, + get_model_loader, + ) + from vllm.model_executor.model_loader.utils import set_default_torch_dtype + + logger.info( + f"[gpu={self.gpu_id}] Update weights begin. 
" + f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB" + ) + + target_device = torch.device(self.device_config.device) + + try: + vllm_model_config = VllmModelConfig( + model=model_path, + quantization=self.server_args.quantization, + tokenizer=None, + tokenizer_mode=None, + trust_remote_code=self.server_args.trust_remote_code, + dtype=self.server_args.dtype, + seed=42, + skip_tokenizer_init=True, + ) + except Exception as e: + logger.error(f"Failed to load model config: {e}") + return False, "Failed to update model weights" + + load_config = LoadConfig(load_format=load_format) + + # Only support vllm DefaultModelLoader for now + loader = get_model_loader(load_config) + if not isinstance(loader, DefaultModelLoader): + logger.error("Failed to get weights iterator: Unsupported loader") + return False, "Failed to update model weights" + + def get_weight_iter(config): + iter = loader._get_weights_iterator( + config.model, + config.revision, + fall_back_to_pt=getattr( + self.model, "fall_back_to_pt_during_load", True + ), + ) + return iter + + def model_load_weights(model, iter): + model.load_weights(iter) + for _, module in self.model.named_modules(): + quant_method = getattr(module, "quant_method", None) + if quant_method is not None: + with device_loading_context(module, target_device): + quant_method.process_weights_after_loading(module) + return model + + with set_default_torch_dtype(vllm_model_config.dtype): + try: + iter = get_weight_iter(vllm_model_config) + except Exception as e: + message = f"Failed to get weights iterator: {e}" + logger.error(message) + return False, message + try: + model = model_load_weights(self.model, iter) + except Exception as e: + message = f"Failed to update weights: {e}. \n Rolling back to original weights" + logger.error(message) + del iter + gc.collect() + iter = get_weight_iter(self.vllm_model_config) + self.model = model_load_weights(self.model, iter) + return False, message + + self.model = model + self.server_args.model_path = model_path + self.server_args.load_format = load_format + self.vllm_model_config = vllm_model_config + self.load_config = load_config + self.model_config.path = model_path + + logger.info(f"[gpu={self.gpu_id}] Update weights end.") + return True, "Succeeded to update model weights" + def profile_max_num_token(self, total_gpu_memory): available_gpu_memory = get_available_gpu_memory( self.gpu_id, distributed=self.tp_size > 1 diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 55271c2352..0c5a3c706b 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -51,7 +51,11 @@ start_controller_process as start_controller_process_single, ) from sglang.srt.managers.detokenizer_manager import start_detokenizer_process -from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput +from sglang.srt.managers.io_struct import ( + EmbeddingReqInput, + GenerateReqInput, + UpdateWeightReqInput, +) from sglang.srt.managers.tokenizer_manager import TokenizerManager from sglang.srt.openai_api.adapter import ( load_chat_template_for_openai_api, @@ -136,6 +140,23 @@ async def flush_cache(): ) +@app.post("/update_weights") +async def update_weights(obj: UpdateWeightReqInput, request: Request): + + success, message = await tokenizer_manager.update_weights(obj, request) + content = {"message": message, "success": str(success)} + if success: + return JSONResponse( + content, + status_code=HTTPStatus.OK, + ) + else: + return JSONResponse( + content, + status_code=HTTPStatus.BAD_REQUEST, + 
) + + async def generate_request(obj: GenerateReqInput, request: Request): """Handle a generate request.""" if obj.stream: diff --git a/test/srt/test_update_weights.py b/test/srt/test_update_weights.py new file mode 100644 index 0000000000..64f84263aa --- /dev/null +++ b/test/srt/test_update_weights.py @@ -0,0 +1,106 @@ +import json +import unittest + +import requests + +from sglang.srt.utils import kill_child_process +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, + popen_launch_server, +) + + +class TestReplaceWeights(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST + cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300) + + @classmethod + def tearDownClass(cls): + kill_child_process(cls.process.pid) + + def run_decode(self): + response = requests.post( + self.base_url + "/generate", + json={ + "text": "The capital of France is", + "sampling_params": { + "temperature": 0, + "max_new_tokens": 32, + "n": 1, + }, + "stream": False, + "return_logprob": False, + "top_logprobs_num": 0, + "return_text_in_logprobs": False, + "logprob_start_len": 0, + }, + ) + print(json.dumps(response.json())) + print("=" * 100) + # return the "text" in response + text = response.json()["text"] + return text + + def get_model_info(self): + response = requests.get(self.base_url + "/get_model_info") + model_path = response.json()["model_path"] + print(json.dumps(response.json())) + return model_path + + def run_update_weights(self, model_path): + response = requests.post( + self.base_url + "/update_weights", + json={ + "model_path": model_path, + }, + ) + print(json.dumps(response.json())) + + def test_replace_weights(self): + origin_model_path = self.get_model_info() + print(f"origin_model_path: {origin_model_path}") + origin_response = self.run_decode() + + # update weights + new_model_path = "meta-llama/Meta-Llama-3.1-8B" + self.run_update_weights(new_model_path) + + updated_model_path = self.get_model_info() + print(f"updated_model_path: {updated_model_path}") + assert updated_model_path == new_model_path + assert updated_model_path != origin_model_path + + updated_response = self.run_decode() + assert origin_response[:32] != updated_response[:32] + + # update weights back + self.run_update_weights(origin_model_path) + updated_model_path = self.get_model_info() + assert updated_model_path == origin_model_path + + updated_response = self.run_decode() + assert origin_response[:32] == updated_response[:32] + + def test_replace_weights_unexist_model(self): + origin_model_path = self.get_model_info() + print(f"origin_model_path: {origin_model_path}") + origin_response = self.run_decode() + + # update weights + new_model_path = "meta-llama/Meta-Llama-3.1-8B-1" + self.run_update_weights(new_model_path) + + updated_model_path = self.get_model_info() + print(f"updated_model_path: {updated_model_path}") + assert updated_model_path == origin_model_path + + updated_response = self.run_decode() + assert origin_response[:32] == updated_response[:32] + + +if __name__ == "__main__": + unittest.main() From bea2bb9eeae6cf6f1bdfbb6aaaae2d91adea7bac Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 20 Aug 2024 22:35:05 -0700 Subject: [PATCH 061/118] Improve multi-node stability (#1171) --- python/sglang/launch_server.py | 9 ++- python/sglang/srt/hf_transformers_utils.py | 16 ++-- .../sglang/srt/managers/controller_multi.py | 2 - 
.../sglang/srt/managers/controller_single.py | 2 - python/sglang/srt/managers/schedule_batch.py | 20 ++--- python/sglang/srt/managers/tp_worker.py | 16 ++-- .../srt/model_executor/cuda_graph_runner.py | 14 +++- .../sglang/srt/model_executor/model_runner.py | 1 + python/sglang/srt/server.py | 73 +++++++++---------- python/sglang/srt/server_args.py | 6 ++ python/sglang/srt/utils.py | 11 ++- 11 files changed, 94 insertions(+), 76 deletions(-) diff --git a/python/sglang/launch_server.py b/python/sglang/launch_server.py index 91dc0dc4e9..1df64e848c 100644 --- a/python/sglang/launch_server.py +++ b/python/sglang/launch_server.py @@ -1,9 +1,11 @@ """Launch the inference server.""" import argparse +import os from sglang.srt.server import launch_server from sglang.srt.server_args import ServerArgs +from sglang.srt.utils import kill_child_process if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -11,4 +13,9 @@ args = parser.parse_args() server_args = ServerArgs.from_cli_args(args) - launch_server(server_args) + try: + launch_server(server_args) + except Exception as e: + raise e + finally: + kill_child_process(os.getpid(), including_parent=False) diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py index b3576b47b7..525d295439 100644 --- a/python/sglang/srt/hf_transformers_utils.py +++ b/python/sglang/srt/hf_transformers_utils.py @@ -233,6 +233,8 @@ def __init__(self, tokenizer_path): } assert tok_dict["word_split"] == "V1" + default_allowed_special = None + kwargs = { "name": name, "pat_str": tok_dict.get("pat_str", PAT_STR_B), @@ -246,14 +248,18 @@ def __init__(self, tokenizer_path): for bytes_list in tok_dict["default_allowed_special"] ] ) - else: - default_allowed_special = None if "vocab_size" in tok_dict: kwargs["explicit_n_vocab"] = tok_dict["vocab_size"] + PAD = "<|pad|>" + EOS = "<|eos|>" + SEP = "<|separator|>" + + DEFAULT_CONTROL_TOKENS = {"pad": PAD, "sep": EOS, "eos": SEP} + tokenizer = tiktoken.Encoding(**kwargs) tokenizer._default_allowed_special = default_allowed_special or set() - tokenizer._default_allowed_special |= {"<|separator|>"} + tokenizer._control_tokens = DEFAULT_CONTROL_TOKENS def encode_patched( self, @@ -270,14 +276,14 @@ def encode_patched( self, text, allowed_special=allowed_special, - disallowed_special=disallowed_special, + disallowed_special=(), ) tokenizer.encode = functools.partial(encode_patched, tokenizer) # Convert to HF interface self.tokenizer = tokenizer - self.eos_token_id = tokenizer._special_tokens["<|eos|>"] + self.eos_token_id = tokenizer._special_tokens[EOS] self.vocab_size = tokenizer.n_vocab self.chat_template = Template( "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}" diff --git a/python/sglang/srt/managers/controller_multi.py b/python/sglang/srt/managers/controller_multi.py index dcd984e0f2..58c4f4484a 100644 --- a/python/sglang/srt/managers/controller_multi.py +++ b/python/sglang/srt/managers/controller_multi.py @@ -212,6 +212,4 @@ def start_controller_process( except Exception: logger.error("Exception in ControllerMulti:\n" + get_exception_traceback()) finally: - for w in controller.workers: - os.kill(w.proc.pid, 9) 
kill_parent_process() diff --git a/python/sglang/srt/managers/controller_single.py b/python/sglang/srt/managers/controller_single.py index 415325b131..a3402c62f7 100644 --- a/python/sglang/srt/managers/controller_single.py +++ b/python/sglang/srt/managers/controller_single.py @@ -167,6 +167,4 @@ def start_controller_process( except Exception: logger.error("Exception in ControllerSingle:\n" + get_exception_traceback()) finally: - for t in controller.tp_procs: - os.kill(t.pid, 9) kill_parent_process() diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 14374e5806..1437d0e6cd 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -16,7 +16,6 @@ """Meta data for requests and batches""" import logging -import warnings from dataclasses import dataclass from typing import List, Optional, Union @@ -270,7 +269,7 @@ def jump_forward_and_retokenize(self, jump_forward_str, next_state): if all_ids[prompt_tokens - 1] != self.origin_input_ids_unpadded[-1]: # TODO(lsyin): fix token fusion - logging.warning( + logger.warning( "Token fusion between input and output, try to avoid this by removing the space at the end of the input." ) return False @@ -753,7 +752,7 @@ def merge(self, other: "ScheduleBatch"): ) self.logit_bias = torch.concat([self.logit_bias, other.logit_bias]) - def sample(self, logits: torch.Tensor, is_multi_node_tp=False): + def sample(self, logits: torch.Tensor): # TODO(lsyin): move this into a part of layer and run with CUDA Graph # Post process logits logits = logits.contiguous() @@ -791,7 +790,7 @@ def sample(self, logits: torch.Tensor, is_multi_node_tp=False): ) if not torch.all(success): - logging.warning("Sampling failed, fallback to top_k=1 strategy") + logger.warning(f"Sampling failed. Fallback to top_k=1 strategy. 
{logits=}") probs = probs.masked_fill(torch.isnan(probs), 0.0) argmax_ids = torch.argmax(probs, dim=-1) batch_next_token_ids = torch.where( @@ -808,16 +807,6 @@ def sample(self, logits: torch.Tensor, is_multi_node_tp=False): self.penalizer_orchestrator.cumulate_output_tokens(batch_next_token_ids) - if is_multi_node_tp: - # If the tensor parallelism spans across multiple nodes, there is some indeterminism - # that can cause the TP workers to generate different tokens, so we need to - # sync here - torch.distributed.all_reduce( - batch_next_token_ids, - op=dist.ReduceOp.MIN, - group=get_tensor_model_parallel_group().device_group, - ) - return batch_next_token_ids @@ -835,7 +824,8 @@ def top_k_top_p_sampling_from_probs_torch( probs_sort.div_(probs_sort.max(dim=-1, keepdim=True)[0]) try: sampled_index = torch.multinomial(probs_sort, num_samples=1) - except RuntimeError: + except RuntimeError as e: + logger.warning(f"Sampling error: {e}") batch_next_token_ids = torch.zeros( (probs_sort.shape[0],), dtype=torch.int32, device=probs.device ) diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index 7bd2e38129..8772a4abbb 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -133,6 +133,13 @@ def __init__( self.model_config.context_len - 1, self.max_total_num_tokens - 1, ) + + # Sync random seed + server_args.random_seed = broadcast_recv_input( + [server_args.random_seed], + self.tp_rank, + self.model_runner.tp_group.cpu_group, + )[0] set_random_seed(server_args.random_seed) # Print info @@ -474,9 +481,7 @@ def forward_prefill_batch(self, batch: ScheduleBatch): # Forward and sample the next tokens if batch.extend_num_tokens != 0: output = self.model_runner.forward(batch, ForwardMode.EXTEND) - next_token_ids = batch.sample( - output.next_token_logits, self.model_runner.is_multi_node_tp - ) + next_token_ids = batch.sample(output.next_token_logits) # Move logprobs to cpu if output.next_token_logprobs is not None: @@ -636,9 +641,7 @@ def forward_decode_batch(self, batch: ScheduleBatch): # Forward and sample the next tokens output = self.model_runner.forward(batch, ForwardMode.DECODE) - next_token_ids = batch.sample( - output.next_token_logits, self.model_runner.is_multi_node_tp - ) + next_token_ids = batch.sample(output.next_token_logits) # Move logprobs to cpu if output.next_token_logprobs is not None: @@ -879,6 +882,7 @@ def broadcast_recv_input( dist.broadcast(tensor_size, src=0, group=dist_group) dist.broadcast(tensor_data, src=0, group=dist_group) + return data else: tensor_size = torch.tensor([0], dtype=torch.long) dist.broadcast(tensor_size, src=0, group=dist_group) diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index af39065cfa..d045be56d8 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -84,13 +84,20 @@ def set_torch_compile_config(): class CudaGraphRunner: - def __init__(self, model_runner, max_batch_size_to_capture, use_torch_compile): + def __init__( + self, + model_runner, + max_batch_size_to_capture: int, + use_torch_compile: bool, + disable_padding: bool, + ): self.model_runner = model_runner self.graphs = {} self.input_buffers = {} self.output_buffers = {} self.flashinfer_handlers = {} self.graph_memory_pool = None + self.disable_padding = disable_padding # Common inputs self.max_bs = max_batch_size_to_capture @@ -142,7 +149,10 @@ def 
__init__(self, model_runner, max_batch_size_to_capture, use_torch_compile): set_torch_compile_config() def can_run(self, batch_size): - return batch_size <= self.max_bs + if self.disable_padding: + return batch_size in self.graphs + else: + return batch_size <= self.max_bs def capture(self, batch_size_list): self.batch_size_list = batch_size_list diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 4a3396cf2c..a00a73945c 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -465,6 +465,7 @@ def init_cuda_graphs(self): self, max_batch_size_to_capture=max(batch_size_list), use_torch_compile=self.server_args.enable_torch_compile, + disable_padding=self.server_args.disable_cuda_graph_padding, ) try: self.cuda_graph_runner.capture(batch_size_list) diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 0c5a3c706b..fbe3374df7 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -24,7 +24,6 @@ import logging import multiprocessing as mp import os -import sys import threading import time from http import HTTPStatus @@ -301,27 +300,29 @@ def launch_server( server_args.tokenizer_path = prepare_tokenizer(server_args.tokenizer_path) # Launch processes for multi-node tensor parallelism - if server_args.nnodes > 1: - if server_args.node_rank != 0: - tp_size_local = server_args.tp_size // server_args.nnodes - gpu_ids = [ - i for _ in range(server_args.nnodes) for i in range(tp_size_local) - ] - tp_rank_range = list( - range( - server_args.node_rank * tp_size_local, - (server_args.node_rank + 1) * tp_size_local, - ) + if server_args.nnodes > 1 and server_args.node_rank != 0: + tp_size_local = server_args.tp_size // server_args.nnodes + gpu_ids = [i for _ in range(server_args.nnodes) for i in range(tp_size_local)] + tp_rank_range = list( + range( + server_args.node_rank * tp_size_local, + (server_args.node_rank + 1) * tp_size_local, ) - procs = launch_tp_servers( - gpu_ids, - tp_rank_range, - server_args, - ports[3], - model_overide_args, - ) - while True: - pass + ) + procs = launch_tp_servers( + gpu_ids, + tp_rank_range, + server_args, + ports[3], + model_overide_args, + ) + + try: + for p in procs: + p.join() + finally: + kill_child_process(os.getpid(), including_parent=False) + return # Launch processes tokenizer_manager = TokenizerManager(server_args, port_args, model_overide_args) @@ -356,15 +357,11 @@ def launch_server( if controller_init_state != "init ok" or detoken_init_state != "init ok": proc_controller.kill() proc_detoken.kill() - print( - f"Initialization failed. controller_init_state: {controller_init_state}", - flush=True, + raise RuntimeError( + "Initialization failed. " + f"controller_init_state: {controller_init_state}, " + f"detoken_init_state: {detoken_init_state}" ) - print( - f"Initialization failed. 
detoken_init_state: {detoken_init_state}", - flush=True, - ) - sys.exit(1) assert proc_controller.is_alive() and proc_detoken.is_alive() # Add api key authorization @@ -373,12 +370,12 @@ def launch_server( # Send a warmup request t = threading.Thread( - target=_wait_and_warmup, args=(server_args, pipe_finish_writer) + target=_wait_and_warmup, args=(server_args, pipe_finish_writer, os.getpid()) ) t.start() - # Listen for requests try: + # Listen for requests uvicorn.run( app, host=server_args.host, @@ -426,7 +423,7 @@ def _set_envs_and_config(server_args: ServerArgs): ) -def _wait_and_warmup(server_args, pipe_finish_writer): +def _wait_and_warmup(server_args, pipe_finish_writer, pid): headers = {} url = server_args.url() if server_args.api_key: @@ -449,8 +446,9 @@ def _wait_and_warmup(server_args, pipe_finish_writer): if not success: if pipe_finish_writer is not None: pipe_finish_writer.send(last_traceback) - print(f"Initialization failed. warmup error: {last_traceback}", flush=True) - sys.exit(1) + logger.error(f"Initialization failed. warmup error: {last_traceback}") + kill_child_process(pid, including_parent=False) + return # Send a warmup request request_name = "/generate" if model_info["is_generation"] else "/encode" @@ -475,12 +473,13 @@ def _wait_and_warmup(server_args, pipe_finish_writer): timeout=600, ) assert res.status_code == 200, f"{res}" - except Exception as e: + except Exception: last_traceback = get_exception_traceback() if pipe_finish_writer is not None: pipe_finish_writer.send(last_traceback) - print(f"Initialization failed. warmup error: {last_traceback}", flush=True) - sys.exit(1) + logger.error(f"Initialization failed. warmup error: {last_traceback}") + kill_child_process(pid, including_parent=False) + return logger.info("The server is fired up and ready to roll!") if pipe_finish_writer is not None: diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index c7120564c1..4dd5baccac 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -79,6 +79,7 @@ class ServerArgs: disable_radix_cache: bool = False disable_regex_jump_forward: bool = False disable_cuda_graph: bool = False + disable_cuda_graph_padding: bool = False disable_disk_cache: bool = False enable_mixed_chunk: bool = False enable_torch_compile: bool = False @@ -393,6 +394,11 @@ def add_cli_args(parser: argparse.ArgumentParser): action="store_true", help="Disable cuda graph.", ) + parser.add_argument( + "--disable-cuda-graph-padding", + action="store_true", + help="Disable cuda graph when padding is needed. 
Still uses cuda graph when padding is not needed.", + ) parser.add_argument( "--disable-disk-cache", action="store_true", diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 9761c851a5..a15ea16307 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -369,14 +369,11 @@ def kill_parent_process(): """Kill the parent process and all children of the parent process.""" current_process = psutil.Process() parent_process = current_process.parent() - children = parent_process.children(recursive=True) - for child in children: - if child.pid != current_process.pid: - os.kill(child.pid, 9) - os.kill(parent_process.pid, 9) + kill_child_process(parent_process.pid, skip_pid=current_process.pid) -def kill_child_process(pid, including_parent=True): +def kill_child_process(pid, including_parent=True, skip_pid=None): + """Kill the process and all its children process.""" try: parent = psutil.Process(pid) except psutil.NoSuchProcess: @@ -384,6 +381,8 @@ def kill_child_process(pid, including_parent=True): children = parent.children(recursive=True) for child in children: + if child.pid == skip_pid: + continue try: child.kill() except psutil.NoSuchProcess: From 1fb94599087e4881c8b31dc4de46b1685fcaa124 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Thu, 22 Aug 2024 07:26:35 +1000 Subject: [PATCH 062/118] fix: custom op fallback forward native when lower sm80 (#1177) --- python/sglang/srt/layers/activation.py | 7 +++++++ python/sglang/srt/layers/layernorm.py | 3 +++ 2 files changed, 10 insertions(+) diff --git a/python/sglang/srt/layers/activation.py b/python/sglang/srt/layers/activation.py index 7cd8abb6f9..a6f05610bd 100644 --- a/python/sglang/srt/layers/activation.py +++ b/python/sglang/srt/layers/activation.py @@ -20,11 +20,18 @@ class SiluAndMul(CustomOp): + def __init__(self, **kwargs): + super().__init__() + self.is_lower_sm80 = torch.cuda.get_device_capability()[0] < 8 + def forward_native(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 return F.silu(x[..., :d]) * x[..., d:] def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + if self.is_lower_sm80: + return self.forward_native(x) + d = x.shape[-1] // 2 output_shape = x.shape[:-1] + (d,) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) diff --git a/python/sglang/srt/layers/layernorm.py b/python/sglang/srt/layers/layernorm.py index ac4d368d3f..6cea85404a 100644 --- a/python/sglang/srt/layers/layernorm.py +++ b/python/sglang/srt/layers/layernorm.py @@ -32,12 +32,15 @@ def __init__( super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps + self.is_lower_sm80 = torch.cuda.get_device_capability()[0] < 8 def forward_cuda( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + if self.is_lower_sm80: + return self.forward_native(x, residual) if residual is not None: fused_add_rmsnorm(x, residual, self.weight.data, self.variance_epsilon) From d6aeb9fa1552939e7444d845a9d0f5e9225daf02 Mon Sep 17 00:00:00 2001 From: rainred <107027757+gryffindor-rr@users.noreply.github.com> Date: Thu, 22 Aug 2024 05:28:35 +0800 Subject: [PATCH 063/118] [Feature] Add a function to convert sampling_params to kwargs (#1170) Co-authored-by: lzhang --- python/sglang/srt/sampling_params.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/python/sglang/srt/sampling_params.py b/python/sglang/srt/sampling_params.py index 6a8823cc4d..712827d792 100644 --- 
a/python/sglang/srt/sampling_params.py +++ b/python/sglang/srt/sampling_params.py @@ -123,3 +123,17 @@ def normalize(self, tokenizer): else: stop_str_max_len = max(stop_str_max_len, len(stop_str)) self.stop_str_max_len = stop_str_max_len + + def to_srt_kwargs(self): + return { + "max_new_tokens": self.max_new_tokens, + "stop": self.stop_strs, + "stop_token_ids": list(self.stop_token_ids), + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": self.top_k, + "frequency_penalty": self.frequency_penalty, + "presence_penalty": self.presence_penalty, + "ignore_eos": self.ignore_eos, + "regex": self.regex, + } From 068e9eae55daf2ca1666cfa64ad66139b02fa623 Mon Sep 17 00:00:00 2001 From: intervitens <155717317+intervitens@users.noreply.github.com> Date: Thu, 22 Aug 2024 01:49:32 +0300 Subject: [PATCH 064/118] Support min-p sampling (#1167) --- docs/en/sampling_params.md | 2 + python/sglang/api.py | 6 +++ python/sglang/lang/compiler.py | 4 ++ python/sglang/lang/interpreter.py | 1 + python/sglang/lang/ir.py | 9 +++++ python/sglang/srt/managers/schedule_batch.py | 41 +++++++++++++++----- python/sglang/srt/sampling_params.py | 4 ++ 7 files changed, 58 insertions(+), 9 deletions(-) diff --git a/docs/en/sampling_params.md b/docs/en/sampling_params.md index 5f1cdece6a..7d866e6929 100644 --- a/docs/en/sampling_params.md +++ b/docs/en/sampling_params.md @@ -45,6 +45,8 @@ temperature: float = 1.0, top_p: float = 1.0, # Top-k sampling top_k: int = -1, +# Min-p sampling +min_p: float = 0.0, # Whether to ignore EOS token. ignore_eos: bool = False, # Whether to skip the special tokens during detokenization. diff --git a/python/sglang/api.py b/python/sglang/api.py index 887ffce76e..3a2f747bec 100644 --- a/python/sglang/api.py +++ b/python/sglang/api.py @@ -66,6 +66,7 @@ def gen( temperature: Optional[float] = None, top_p: Optional[float] = None, top_k: Optional[int] = None, + min_p: Optional[float] = None, frequency_penalty: Optional[float] = None, presence_penalty: Optional[float] = None, ignore_eos: Optional[bool] = None, @@ -103,6 +104,7 @@ def gen( temperature, top_p, top_k, + min_p, frequency_penalty, presence_penalty, ignore_eos, @@ -123,6 +125,7 @@ def gen_int( temperature: Optional[float] = None, top_p: Optional[float] = None, top_k: Optional[int] = None, + min_p: Optional[float] = None, frequency_penalty: Optional[float] = None, presence_penalty: Optional[float] = None, ignore_eos: Optional[bool] = None, @@ -139,6 +142,7 @@ def gen_int( temperature, top_p, top_k, + min_p, frequency_penalty, presence_penalty, ignore_eos, @@ -159,6 +163,7 @@ def gen_string( temperature: Optional[float] = None, top_p: Optional[float] = None, top_k: Optional[int] = None, + min_p: Optional[float] = None, frequency_penalty: Optional[float] = None, presence_penalty: Optional[float] = None, ignore_eos: Optional[bool] = None, @@ -175,6 +180,7 @@ def gen_string( temperature, top_p, top_k, + min_p, frequency_penalty, presence_penalty, ignore_eos, diff --git a/python/sglang/lang/compiler.py b/python/sglang/lang/compiler.py index 95af04adb0..5e1b411fc2 100644 --- a/python/sglang/lang/compiler.py +++ b/python/sglang/lang/compiler.py @@ -130,6 +130,7 @@ def run( temperature: float = 1.0, top_p: float = 1.0, top_k: int = -1, + min_p: float = 0.0, frequency_penalty: float = 0.0, presence_penalty: float = 0.0, backend=None, @@ -145,6 +146,7 @@ def run( temperature=temperature, top_p=top_p, top_k=top_k, + min_p=min_p, frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, ) @@ -160,6 +162,7 @@ def run_batch( 
temperature: float = 1.0, top_p: float = 1.0, top_k: int = -1, + min_p: float = 0.0, frequency_penalty: float = 0.0, presence_penalty: float = 0.0, backend=None, @@ -178,6 +181,7 @@ def run_batch( temperature=temperature, top_p=top_p, top_k=top_k, + min_p=min_p, frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, ) diff --git a/python/sglang/lang/interpreter.py b/python/sglang/lang/interpreter.py index 844c9d062b..306d280c7f 100644 --- a/python/sglang/lang/interpreter.py +++ b/python/sglang/lang/interpreter.py @@ -663,6 +663,7 @@ def _resolve_sampling_params(self, sampling_params): "temperature", "top_p", "top_k", + "min_p", "frequency_penalty", "presence_penalty", "ignore_eos", diff --git a/python/sglang/lang/ir.py b/python/sglang/lang/ir.py index 9db5f2719e..199a7ac7a4 100644 --- a/python/sglang/lang/ir.py +++ b/python/sglang/lang/ir.py @@ -22,6 +22,7 @@ class SglSamplingParams: temperature: float = 1.0 top_p: float = 1.0 top_k: int = -1 # -1 means disable + min_p: float = 0.0 frequency_penalty: float = 0.0 presence_penalty: float = 0.0 ignore_eos: bool = False @@ -42,6 +43,7 @@ def clone(self): self.temperature, self.top_p, self.top_k, + self.min_p, self.frequency_penalty, self.presence_penalty, self.ignore_eos, @@ -114,6 +116,7 @@ def to_srt_kwargs(self): "temperature": self.temperature, "top_p": self.top_p, "top_k": self.top_k, + "min_p": self.min_p, "frequency_penalty": self.frequency_penalty, "presence_penalty": self.presence_penalty, "ignore_eos": self.ignore_eos, @@ -149,6 +152,7 @@ def run( temperature: float = 1.0, top_p: float = 1.0, top_k: int = -1, + min_p: float = 0.0, frequency_penalty: float = 0.0, presence_penalty: float = 0.0, ignore_eos: bool = False, @@ -169,6 +173,7 @@ def run( temperature=temperature, top_p=top_p, top_k=top_k, + min_p=min_p, frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, ignore_eos=ignore_eos, @@ -190,6 +195,7 @@ def run_batch( temperature: float = 1.0, top_p: float = 1.0, top_k: int = -1, + min_p: float = 0.0, frequency_penalty: float = 0.0, presence_penalty: float = 0.0, ignore_eos: bool = False, @@ -228,6 +234,7 @@ def run_batch( temperature=temperature, top_p=top_p, top_k=top_k, + min_p=min_p, frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, ignore_eos=ignore_eos, @@ -408,6 +415,7 @@ def __init__( temperature: Optional[float] = None, top_p: Optional[float] = None, top_k: Optional[int] = None, + min_p: Optional[float] = None, frequency_penalty: Optional[float] = None, presence_penalty: Optional[float] = None, ignore_eos: Optional[bool] = None, @@ -428,6 +436,7 @@ def __init__( temperature=temperature, top_p=top_p, top_k=top_k, + min_p=min_p, frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, ignore_eos=ignore_eos, diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 1437d0e6cd..9abce6f9b9 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -21,7 +21,12 @@ import torch import torch.distributed as dist -from flashinfer.sampling import top_k_top_p_sampling_from_probs +from flashinfer.sampling import ( + min_p_sampling_from_probs, + top_k_renorm_prob, + top_k_top_p_sampling_from_probs, + top_p_renorm_prob, +) from vllm.distributed import get_tensor_model_parallel_group import sglang.srt.sampling.penaltylib as penaltylib @@ -339,6 +344,7 @@ class ScheduleBatch: temperatures: torch.Tensor = None top_ps: torch.Tensor = None top_ks: torch.Tensor = None 
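+    # min-p sampling: tokens with probability below min_p * max(prob) are masked out; 0.0 disables the filter.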
+ min_ps: torch.Tensor = None penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None logit_bias: torch.Tensor = None @@ -403,6 +409,9 @@ def batch_sampling_params(self, vocab_size): self.top_ks = torch.tensor( [r.sampling_params.top_k for r in reqs], dtype=torch.int, device=device ) + self.min_ps = torch.tensor( + [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device + ) # Each penalizers will do nothing if they evaluate themselves as not required by looking at # the sampling_params of the requests (See {_is_required()} of each penalizers). So this @@ -701,6 +710,7 @@ def filter_batch(self, unfinished_indices: List[int]): "temperatures", "top_ps", "top_ks", + "min_ps", "logit_bias", ]: self_val = getattr(self, item, None) @@ -730,6 +740,7 @@ def merge(self, other: "ScheduleBatch"): "temperatures", "top_ps", "top_ks", + "min_ps", ]: self_val = getattr(self, item, None) other_val = getattr(other, item, None) @@ -780,13 +791,20 @@ def sample(self, logits: torch.Tensor): uniform_samples = torch.rand( (max_top_k_round, batch_size), device=probs.device ) - batch_next_token_ids, success = top_k_top_p_sampling_from_probs( - probs, uniform_samples, self.top_ks, self.top_ps - ) + if self.min_ps.any(): + probs = top_k_renorm_prob(probs, self.top_ks) + probs = top_p_renorm_prob(probs, self.top_ps) + batch_next_token_ids, success = min_p_sampling_from_probs( + probs, uniform_samples, self.min_ps + ) + else: + batch_next_token_ids, success = top_k_top_p_sampling_from_probs( + probs, uniform_samples, self.top_ks, self.top_ps + ) else: # Here we provide a slower fallback implementation. - batch_next_token_ids, success = top_k_top_p_sampling_from_probs_torch( - probs, self.top_ks, self.top_ps + batch_next_token_ids, success = top_k_top_p_min_p_sampling_from_probs_torch( + probs, self.top_ks, self.top_ps, self.min_ps ) if not torch.all(success): @@ -810,17 +828,22 @@ def sample(self, logits: torch.Tensor): return batch_next_token_ids -def top_k_top_p_sampling_from_probs_torch( - probs: torch.Tensor, top_ks: torch.Tensor, top_ps: torch.Tensor +def top_k_top_p_min_p_sampling_from_probs_torch( + probs: torch.Tensor, + top_ks: torch.Tensor, + top_ps: torch.Tensor, + min_ps: torch.Tensor, ): - """A top-k and top-k sampling implementation with native pytorch operations.""" + """A top-k, top-p and min-p sampling implementation with native pytorch operations.""" probs_sort, probs_idx = probs.sort(dim=-1, descending=True) probs_sum = torch.cumsum(probs_sort, dim=-1) + min_p_thresholds = probs_sort[:, 0] * min_ps probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0 probs_sort[ torch.arange(0, probs.shape[-1], device=probs.device).view(1, -1) >= top_ks.view(-1, 1) ] = 0.0 + probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0 probs_sort.div_(probs_sort.max(dim=-1, keepdim=True)[0]) try: sampled_index = torch.multinomial(probs_sort, num_samples=1) diff --git a/python/sglang/srt/sampling_params.py b/python/sglang/srt/sampling_params.py index 712827d792..c30717dd7c 100644 --- a/python/sglang/srt/sampling_params.py +++ b/python/sglang/srt/sampling_params.py @@ -30,6 +30,7 @@ def __init__( temperature: float = 1.0, top_p: float = 1.0, top_k: int = -1, + min_p: float = 0.0, frequency_penalty: float = 0.0, presence_penalty: float = 0.0, repetition_penalty: float = 1.0, @@ -42,6 +43,7 @@ def __init__( self.temperature = temperature self.top_p = top_p self.top_k = top_k + self.min_p = min_p self.frequency_penalty = frequency_penalty self.presence_penalty = presence_penalty 
self.repetition_penalty = repetition_penalty @@ -69,6 +71,8 @@ def verify(self): ) if not 0.0 < self.top_p <= 1.0: raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.") + if not 0.0 <= self.min_p <= 1.0: + raise ValueError(f"min_p must be in [0, 1], got {self.min_p}.") if self.top_k < -1 or self.top_k == 0: raise ValueError( f"top_k must be -1 (disable), or at least 1, " f"got {self.top_k}." From ac1b74fa8548adf4f3b3a14b737702158c95c8d9 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 21 Aug 2024 16:05:33 -0700 Subject: [PATCH 065/118] [Docs] Fix rendering of details in README (#1179) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index a26f5dc5a9..c7d47d6786 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,7 @@ docker run --gpus all \ ### Method 4: Using docker compose
+ > This method is recommended if you plan to serve it as a service. > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml). @@ -92,6 +93,7 @@ docker run --gpus all \ ### Method 5: Run on Kubernetes or Clouds with SkyPilot
+ To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot). 1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). From 83e23c69b35ce26857ee415b243812973fdb9573 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Wed, 21 Aug 2024 16:48:24 -0700 Subject: [PATCH 066/118] Improve code style of sampler (#1168) --- examples/usage/json_decode.py | 3 + python/sglang/bench_latency.py | 2 +- python/sglang/srt/layers/sampler.py | 101 +++++++++ python/sglang/srt/managers/io_struct.py | 2 +- python/sglang/srt/managers/schedule_batch.py | 199 +----------------- .../sglang/srt/managers/tokenizer_manager.py | 2 +- python/sglang/srt/managers/tp_worker.py | 16 ++ .../sglang/srt/model_executor/model_runner.py | 3 - .../srt/sampling/sampling_batch_info.py | 136 ++++++++++++ .../srt/{ => sampling}/sampling_params.py | 0 10 files changed, 269 insertions(+), 195 deletions(-) create mode 100644 python/sglang/srt/layers/sampler.py create mode 100644 python/sglang/srt/sampling/sampling_batch_info.py rename python/sglang/srt/{ => sampling}/sampling_params.py (100%) diff --git a/examples/usage/json_decode.py b/examples/usage/json_decode.py index dc34d3527b..ce8f5ba706 100644 --- a/examples/usage/json_decode.py +++ b/examples/usage/json_decode.py @@ -35,6 +35,9 @@ def character_gen(s, name): name + " is a character in Harry Potter. Please fill in the following information about this character.\n" ) + s += "The constrained regex is:\n" + s += character_regex + "\n" + s += "The JSON output is:\n" s += sgl.gen("json_output", max_tokens=256, regex=character_regex) diff --git a/python/sglang/bench_latency.py b/python/sglang/bench_latency.py index dd86747e36..d9131c87f0 100644 --- a/python/sglang/bench_latency.py +++ b/python/sglang/bench_latency.py @@ -54,7 +54,7 @@ from sglang.srt.model_config import ModelConfig from sglang.srt.model_executor.forward_batch_info import ForwardMode from sglang.srt.model_executor.model_runner import ModelRunner -from sglang.srt.sampling_params import SamplingParams +from sglang.srt.sampling.sampling_params import SamplingParams from sglang.srt.server_args import ServerArgs from sglang.srt.utils import suppress_other_loggers diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py new file mode 100644 index 0000000000..3006e765c8 --- /dev/null +++ b/python/sglang/srt/layers/sampler.py @@ -0,0 +1,101 @@ +import logging + +import torch +from flashinfer.sampling import ( + min_p_sampling_from_probs, + top_k_renorm_prob, + top_k_top_p_sampling_from_probs, + top_p_renorm_prob, +) +from vllm.model_executor.custom_op import CustomOp + +# TODO: move this dict to another place +from sglang.srt.managers.schedule_batch import global_server_args_dict +from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo + +logger = logging.getLogger(__name__) + + +class Sampler(CustomOp): + def __init__(self): + super().__init__() + + def forward_cuda(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo): + # Post process logits + logits = logits.contiguous() + logits.div_(sampling_info.temperatures) + if sampling_info.logit_bias is not None: + logits.add_(sampling_info.logit_bias) + + if sampling_info.vocab_mask is not None: + logits = logits.masked_fill(~sampling_info.vocab_mask, float("-inf")) + + logits = sampling_info.penalizer_orchestrator.apply(logits) + + probs = torch.softmax(logits, dim=-1) 
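+        # `probs` is now the temperature-scaled, penalty- and bias-adjusted
+        # distribution (with any regex vocab mask applied). The branch below
+        # prefers flashinfer's fused top-k/top-p/min-p kernels and falls back
+        # to the slower pure-torch implementation when flashinfer sampling is
+        # disabled.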
+ + if not global_server_args_dict["disable_flashinfer_sampling"]: + max_top_k_round, batch_size = 32, probs.shape[0] + uniform_samples = torch.rand( + (max_top_k_round, batch_size), device=probs.device + ) + if sampling_info.min_ps.any(): + probs = top_k_renorm_prob(probs, sampling_info.top_ks) + probs = top_p_renorm_prob(probs, sampling_info.top_ps) + batch_next_token_ids, success = min_p_sampling_from_probs( + probs, uniform_samples, sampling_info.min_ps + ) + else: + batch_next_token_ids, success = top_k_top_p_sampling_from_probs( + probs, uniform_samples, sampling_info.top_ks, sampling_info.top_ps + ) + else: + # Here we provide a slower fallback implementation. + batch_next_token_ids, success = top_k_top_p_min_p_sampling_from_probs_torch( + probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps + ) + + if not torch.all(success): + logging.warning("Sampling failed, fallback to top_k=1 strategy") + probs = probs.masked_fill(torch.isnan(probs), 0.0) + argmax_ids = torch.argmax(probs, dim=-1) + batch_next_token_ids = torch.where( + success, batch_next_token_ids, argmax_ids + ) + + return batch_next_token_ids + + def forward_native(): + raise NotImplementedError("Native forward is not implemented yet.") + + +def top_k_top_p_min_p_sampling_from_probs_torch( + probs: torch.Tensor, + top_ks: torch.Tensor, + top_ps: torch.Tensor, + min_ps: torch.Tensor, +): + """A top-k, top-p and min-p sampling implementation with native pytorch operations.""" + probs_sort, probs_idx = probs.sort(dim=-1, descending=True) + probs_sum = torch.cumsum(probs_sort, dim=-1) + min_p_thresholds = probs_sort[:, 0] * min_ps + probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0 + probs_sort[ + torch.arange(0, probs.shape[-1], device=probs.device).view(1, -1) + >= top_ks.view(-1, 1) + ] = 0.0 + probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0 + probs_sort.div_(probs_sort.max(dim=-1, keepdim=True)[0]) + try: + sampled_index = torch.multinomial(probs_sort, num_samples=1) + except RuntimeError as e: + logger.warning(f"Sampling error: {e}") + batch_next_token_ids = torch.zeros( + (probs_sort.shape[0],), dtype=torch.int32, device=probs.device + ) + success = torch.zeros(probs.shape[0], dtype=torch.bool, device=probs.device) + return batch_next_token_ids, success + + batch_next_token_ids = torch.gather(probs_idx, dim=1, index=sampled_index).view(-1) + success = torch.ones(probs.shape[0], dtype=torch.bool, device=probs.device) + return batch_next_token_ids, success diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index dc82245931..56e3d8f799 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -23,7 +23,7 @@ from typing import Dict, List, Optional, Union from sglang.srt.managers.schedule_batch import BaseFinishReason -from sglang.srt.sampling_params import SamplingParams +from sglang.srt.sampling.sampling_params import SamplingParams @dataclass diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 9abce6f9b9..88a6168325 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -20,22 +20,14 @@ from typing import List, Optional, Union import torch -import torch.distributed as dist -from flashinfer.sampling import ( - min_p_sampling_from_probs, - top_k_renorm_prob, - top_k_top_p_sampling_from_probs, - top_p_renorm_prob, -) -from vllm.distributed import get_tensor_model_parallel_group - 
-import sglang.srt.sampling.penaltylib as penaltylib + from sglang.global_config import global_config from sglang.srt.constrained import RegexGuide from sglang.srt.constrained.jump_forward import JumpForwardMap from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache from sglang.srt.mem_cache.chunk_cache import ChunkCache from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool +from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5 @@ -340,14 +332,6 @@ class ScheduleBatch: return_logprob: bool = False top_logprobs_nums: List[int] = None - # Batched sampling params - temperatures: torch.Tensor = None - top_ps: torch.Tensor = None - top_ks: torch.Tensor = None - min_ps: torch.Tensor = None - penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None - logit_bias: torch.Tensor = None - @classmethod def init_new(cls, reqs, req_to_token_pool, token_to_kv_pool, tree_cache): return_logprob = any(req.return_logprob for req in reqs) @@ -395,46 +379,6 @@ def alloc_token_slots(self, num_tokens: int): return out_cache_loc - def batch_sampling_params(self, vocab_size): - device = "cuda" - bs, reqs = self.batch_size(), self.reqs - self.temperatures = torch.tensor( - [r.sampling_params.temperature for r in reqs], - dtype=torch.float, - device=device, - ).view(-1, 1) - self.top_ps = torch.tensor( - [r.sampling_params.top_p for r in reqs], dtype=torch.float, device=device - ) - self.top_ks = torch.tensor( - [r.sampling_params.top_k for r in reqs], dtype=torch.int, device=device - ) - self.min_ps = torch.tensor( - [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device - ) - - # Each penalizers will do nothing if they evaluate themselves as not required by looking at - # the sampling_params of the requests (See {_is_required()} of each penalizers). So this - # should not add hefty computation overhead other than simple checks. - # - # While we choose not to even create the class instances if they are not required, this - # could add additional complexity to the {ScheduleBatch} class, especially we need to - # handle {filter_batch()} and {merge()} cases as well. 
- self.penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator( - vocab_size=vocab_size, - batch=self, - device=device, - Penalizers={ - penaltylib.BatchedFrequencyPenalizer, - penaltylib.BatchedMinNewTokensPenalizer, - penaltylib.BatchedPresencePenalizer, - penaltylib.BatchedRepetitionPenalizer, - }, - ) - - # Handle logit bias but only allocate when needed - self.logit_bias = None - def prepare_for_extend(self, vocab_size: int): bs = self.batch_size() reqs = self.reqs @@ -475,7 +419,7 @@ def prepare_for_extend(self, vocab_size: int): self.top_logprobs_nums = [r.top_logprobs_num for r in reqs] self.prefix_lens_cpu = [len(r.prefix_indices) for r in reqs] - self.batch_sampling_params(vocab_size) + self.sampling_info = SamplingBatchInfo.from_schedule_batch(self, vocab_size) def mix_with_running(self, running_batch: "ScheduleBatch"): # NOTE: prefix_indices is what has been cached, but we don't cache each decode step @@ -684,6 +628,8 @@ def prepare_for_decode(self, input_ids=None): self.req_pool_indices, self.seq_lens - 1 ] = self.out_cache_loc + self.sampling_info.update_regex_vocab_mask(self) + def filter_batch(self, unfinished_indices: List[int]): if unfinished_indices is None or len(unfinished_indices) == 0: # Filter out all requests @@ -704,24 +650,13 @@ def filter_batch(self, unfinished_indices: List[int]): self.top_logprobs_nums = [self.top_logprobs_nums[i] for i in unfinished_indices] self.return_logprob = any(req.return_logprob for req in self.reqs) - self.penalizer_orchestrator.filter(unfinished_indices, new_indices) - - for item in [ - "temperatures", - "top_ps", - "top_ks", - "min_ps", - "logit_bias", - ]: - self_val = getattr(self, item, None) - if self_val is not None: # logit_bias can be None - setattr(self, item, self_val[new_indices]) + self.sampling_info.filter(unfinished_indices, new_indices) def merge(self, other: "ScheduleBatch"): # Penalizer orchestrator must be merged before Batch.reqs is merged. This is because # orchestrator.merge() depends on Batch.reqs during preparation of each penalizers, so it # needs to be called with pre-merged Batch.reqs. 
- self.penalizer_orchestrator.merge(other.penalizer_orchestrator) + self.sampling_info.merge(other.sampling_info) self.reqs.extend(other.reqs) @@ -736,125 +671,11 @@ def merge(self, other: "ScheduleBatch"): self.top_logprobs_nums.extend(other.top_logprobs_nums) self.return_logprob = any(req.return_logprob for req in self.reqs) - for item in [ - "temperatures", - "top_ps", - "top_ks", - "min_ps", - ]: - self_val = getattr(self, item, None) - other_val = getattr(other, item, None) - setattr(self, item, torch.concat([self_val, other_val])) - - # logit_bias can be None - if self.logit_bias is not None or other.logit_bias is not None: - vocab_size = ( - self.logit_bias.shape[1] - if self.logit_bias is not None - else other.logit_bias.shape[1] - ) - if self.logit_bias is None: - self.logit_bias = torch.zeros( - (len(self.reqs), vocab_size), dtype=torch.float32, device="cuda" - ) - if other.logit_bias is None: - other.logit_bias = torch.zeros( - (len(other.reqs), vocab_size), dtype=torch.float32, device="cuda" - ) - self.logit_bias = torch.concat([self.logit_bias, other.logit_bias]) - def sample(self, logits: torch.Tensor): - # TODO(lsyin): move this into a part of layer and run with CUDA Graph - # Post process logits - logits = logits.contiguous() - logits.div_(self.temperatures) - if self.logit_bias is not None: - logits.add_(self.logit_bias) - - has_regex = any(req.regex_fsm is not None for req in self.reqs) - if has_regex: - allowed_mask = torch.empty_like(logits[0], dtype=torch.bool) - for i, req in enumerate(self.reqs): - if req.regex_fsm is not None: - allowed_mask.zero_() - allowed_mask[ - req.regex_fsm.get_next_instruction(req.regex_fsm_state).tokens - ] = 1 - logits[i].masked_fill_(~allowed_mask, float("-inf")) - - logits = self.penalizer_orchestrator.apply(logits) - - probs = torch.softmax(logits, dim=-1) - - if not global_server_args_dict["disable_flashinfer_sampling"]: - max_top_k_round, batch_size = 32, probs.shape[0] - uniform_samples = torch.rand( - (max_top_k_round, batch_size), device=probs.device - ) - if self.min_ps.any(): - probs = top_k_renorm_prob(probs, self.top_ks) - probs = top_p_renorm_prob(probs, self.top_ps) - batch_next_token_ids, success = min_p_sampling_from_probs( - probs, uniform_samples, self.min_ps - ) - else: - batch_next_token_ids, success = top_k_top_p_sampling_from_probs( - probs, uniform_samples, self.top_ks, self.top_ps - ) - else: - # Here we provide a slower fallback implementation. - batch_next_token_ids, success = top_k_top_p_min_p_sampling_from_probs_torch( - probs, self.top_ks, self.top_ps, self.min_ps - ) + from sglang.srt.layers.sampler import Sampler - if not torch.all(success): - logger.warning(f"Sampling failed. Fallback to top_k=1 strategy. 
{logits=}") - probs = probs.masked_fill(torch.isnan(probs), 0.0) - argmax_ids = torch.argmax(probs, dim=-1) - batch_next_token_ids = torch.where( - success, batch_next_token_ids, argmax_ids - ) - - if has_regex: - batch_next_token_ids_cpu = batch_next_token_ids.cpu().numpy() - for i, req in enumerate(self.reqs): - if req.regex_fsm is not None: - req.regex_fsm_state = req.regex_fsm.get_next_state( - req.regex_fsm_state, batch_next_token_ids_cpu[i] - ) + sampler = Sampler() - self.penalizer_orchestrator.cumulate_output_tokens(batch_next_token_ids) + batch_next_token_ids = sampler(logits, self.sampling_info) return batch_next_token_ids - - -def top_k_top_p_min_p_sampling_from_probs_torch( - probs: torch.Tensor, - top_ks: torch.Tensor, - top_ps: torch.Tensor, - min_ps: torch.Tensor, -): - """A top-k, top-p and min-p sampling implementation with native pytorch operations.""" - probs_sort, probs_idx = probs.sort(dim=-1, descending=True) - probs_sum = torch.cumsum(probs_sort, dim=-1) - min_p_thresholds = probs_sort[:, 0] * min_ps - probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0 - probs_sort[ - torch.arange(0, probs.shape[-1], device=probs.device).view(1, -1) - >= top_ks.view(-1, 1) - ] = 0.0 - probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0 - probs_sort.div_(probs_sort.max(dim=-1, keepdim=True)[0]) - try: - sampled_index = torch.multinomial(probs_sort, num_samples=1) - except RuntimeError as e: - logger.warning(f"Sampling error: {e}") - batch_next_token_ids = torch.zeros( - (probs_sort.shape[0],), dtype=torch.int32, device=probs.device - ) - success = torch.zeros(probs.shape[0], dtype=torch.bool, device=probs.device) - return batch_next_token_ids, success - - batch_next_token_ids = torch.gather(probs_idx, dim=1, index=sampled_index).view(-1) - success = torch.ones(probs.shape[0], dtype=torch.bool, device=probs.device) - return batch_next_token_ids, success diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index ab375a39a9..32d1f43d38 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -50,7 +50,7 @@ UpdateWeightReqOutput, ) from sglang.srt.mm_utils import expand2square, process_anyres_image -from sglang.srt.sampling_params import SamplingParams +from sglang.srt.sampling.sampling_params import SamplingParams from sglang.srt.server_args import PortArgs, ServerArgs from sglang.srt.utils import is_generation_model, is_multimodal_model, load_image from sglang.utils import get_exception_traceback diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index 8772a4abbb..41f9083012 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -482,6 +482,9 @@ def forward_prefill_batch(self, batch: ScheduleBatch): if batch.extend_num_tokens != 0: output = self.model_runner.forward(batch, ForwardMode.EXTEND) next_token_ids = batch.sample(output.next_token_logits) + batch.sampling_info.penalizer_orchestrator.cumulate_output_tokens( + next_token_ids + ) # Move logprobs to cpu if output.next_token_logprobs is not None: @@ -514,6 +517,11 @@ def forward_prefill_batch(self, batch: ScheduleBatch): req.output_ids.append(next_token_ids[i]) req.check_finished() + if req.regex_fsm is not None: + req.regex_fsm_state = req.regex_fsm.get_next_state( + req.regex_fsm_state, next_token_ids[i] + ) + if req.finished(): self.tree_cache.cache_finished_req(req) elif req not in decoding_reqs: @@ 
-642,6 +650,9 @@ def forward_decode_batch(self, batch: ScheduleBatch): # Forward and sample the next tokens output = self.model_runner.forward(batch, ForwardMode.DECODE) next_token_ids = batch.sample(output.next_token_logits) + batch.sampling_info.penalizer_orchestrator.cumulate_output_tokens( + next_token_ids + ) # Move logprobs to cpu if output.next_token_logprobs is not None: @@ -658,6 +669,11 @@ def forward_decode_batch(self, batch: ScheduleBatch): req.output_ids.append(next_token_id) req.check_finished() + if req.regex_fsm is not None: + req.regex_fsm_state = req.regex_fsm.get_next_state( + req.regex_fsm_state, next_token_id + ) + if req.finished(): self.tree_cache.cache_finished_req(req) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index a00a73945c..b91191c5dc 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -120,9 +120,6 @@ def __init__( self.gpu_id, distributed=self.tp_size > 1 ) self.tp_group = get_tp_group() - self.is_multi_node_tp = not all( - in_the_same_node_as(self.tp_group.cpu_group, source_rank=0) - ) if self.tp_size > 1: total_local_gpu_memory = get_available_gpu_memory(self.gpu_id) diff --git a/python/sglang/srt/sampling/sampling_batch_info.py b/python/sglang/srt/sampling/sampling_batch_info.py new file mode 100644 index 0000000000..bc70a9018e --- /dev/null +++ b/python/sglang/srt/sampling/sampling_batch_info.py @@ -0,0 +1,136 @@ +from __future__ import annotations + +import dataclasses +from typing import TYPE_CHECKING, List + +import torch + +import sglang.srt.sampling.penaltylib as penaltylib + +if TYPE_CHECKING: + from sglang.srt.managers.schedule_batch import ScheduleBatch + + +@dataclasses.dataclass +class SamplingBatchInfo: + # Basic Info + vocab_size: int + + # Batched sampling params + temperatures: torch.Tensor = None + top_ps: torch.Tensor = None + top_ks: torch.Tensor = None + min_ps: torch.Tensor = None + penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None + logit_bias: torch.Tensor = None + vocab_mask: torch.Tensor = None + + @classmethod + def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int): + device = "cuda" + reqs = batch.reqs + ret = cls(vocab_size=vocab_size) + + ret.temperatures = torch.tensor( + [r.sampling_params.temperature for r in reqs], + dtype=torch.float, + device=device, + ).view(-1, 1) + ret.top_ps = torch.tensor( + [r.sampling_params.top_p for r in reqs], dtype=torch.float, device=device + ) + ret.top_ks = torch.tensor( + [r.sampling_params.top_k for r in reqs], dtype=torch.int, device=device + ) + ret.min_ps = torch.tensor( + [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device + ) + + # Each penalizers will do nothing if they evaluate themselves as not required by looking at + # the sampling_params of the requests (See {_is_required()} of each penalizers). So this + # should not add hefty computation overhead other than simple checks. + # + # While we choose not to even create the class instances if they are not required, this + # could add additional complexity to the {ScheduleBatch} class, especially we need to + # handle {filter_batch()} and {merge()} cases as well. 
+ ret.penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator( + vocab_size=vocab_size, + batch=batch, + device=device, + Penalizers={ + penaltylib.BatchedFrequencyPenalizer, + penaltylib.BatchedMinNewTokensPenalizer, + penaltylib.BatchedPresencePenalizer, + penaltylib.BatchedRepetitionPenalizer, + }, + ) + + # Handle logit bias but only allocate when needed + ret.logit_bias = None + + ret.update_regex_vocab_mask(batch) + + return ret + + def update_regex_vocab_mask(self, batch: ScheduleBatch): + bs, reqs = batch.batch_size(), batch.reqs + device = "cuda" + has_regex = any(req.regex_fsm is not None for req in reqs) + + # Reset the vocab mask + self.vocab_mask = None + + if has_regex: + for i, req in enumerate(reqs): + if req.regex_fsm is not None: + if self.vocab_mask is None: + self.vocab_mask = torch.zeros( + bs, self.vocab_size, dtype=torch.bool, device=device + ) + self.vocab_mask[i][ + req.regex_fsm.get_next_instruction(req.regex_fsm_state).tokens + ] = 1 + + def filter(self, unfinished_indices: List[int], new_indices: torch.Tensor): + self.penalizer_orchestrator.filter(unfinished_indices, new_indices) + + for item in [ + "temperatures", + "top_ps", + "top_ks", + "min_ps", + "logit_bias", + ]: + self_val = getattr(self, item, None) + if self_val is not None: # logit_bias can be None + setattr(self, item, self_val[new_indices]) + + def merge(self, other: "SamplingBatchInfo"): + self.penalizer_orchestrator.merge(other.penalizer_orchestrator) + + for item in [ + "temperatures", + "top_ps", + "top_ks", + "min_ps", + ]: + self_val = getattr(self, item, None) + other_val = getattr(other, item, None) + setattr(self, item, torch.concat([self_val, other_val])) + + # logit_bias can be None + if self.logit_bias is not None or other.logit_bias is not None: + vocab_size = ( + self.logit_bias.shape[1] + if self.logit_bias is not None + else other.logit_bias.shape[1] + ) + if self.logit_bias is None: + self.logit_bias = torch.zeros( + (len(self.reqs), vocab_size), dtype=torch.float32, device="cuda" + ) + if other.logit_bias is None: + other.logit_bias = torch.zeros( + (len(other.reqs), vocab_size), dtype=torch.float32, device="cuda" + ) + self.logit_bias = torch.concat([self.logit_bias, other.logit_bias]) diff --git a/python/sglang/srt/sampling_params.py b/python/sglang/srt/sampling/sampling_params.py similarity index 100% rename from python/sglang/srt/sampling_params.py rename to python/sglang/srt/sampling/sampling_params.py From 5623826f7363e41f97db2cfe6e7f1244d9222d35 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 21 Aug 2024 19:24:36 -0700 Subject: [PATCH 067/118] [Minor] Improve logging and rename the health check endpoint name (#1180) --- .../sglang/srt/managers/controller_multi.py | 1 - .../sglang/srt/managers/controller_single.py | 1 - .../srt/managers/detokenizer_manager.py | 4 ++++ .../sglang/srt/managers/tokenizer_manager.py | 10 +++++----- python/sglang/srt/server.py | 20 +++++++++---------- python/sglang/srt/server_args.py | 15 +++----------- 6 files changed, 21 insertions(+), 30 deletions(-) diff --git a/python/sglang/srt/managers/controller_multi.py b/python/sglang/srt/managers/controller_multi.py index 58c4f4484a..38229cd466 100644 --- a/python/sglang/srt/managers/controller_multi.py +++ b/python/sglang/srt/managers/controller_multi.py @@ -21,7 +21,6 @@ import dataclasses import logging import multiprocessing -import os from enum import Enum, auto import numpy as np diff --git a/python/sglang/srt/managers/controller_single.py 
b/python/sglang/srt/managers/controller_single.py index a3402c62f7..422db943f6 100644 --- a/python/sglang/srt/managers/controller_single.py +++ b/python/sglang/srt/managers/controller_single.py @@ -17,7 +17,6 @@ import logging import multiprocessing -import os from typing import List import zmq diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py index e1402795fb..9a4306372b 100644 --- a/python/sglang/srt/managers/detokenizer_manager.py +++ b/python/sglang/srt/managers/detokenizer_manager.py @@ -39,6 +39,8 @@ @dataclasses.dataclass class DecodeStatus: + """Store the status of incremental decoding.""" + vid: int decoded_text: str decode_ids: List[int] @@ -47,6 +49,8 @@ class DecodeStatus: class DetokenizerManager: + """DetokenizerManager is a process that detokenizes the token ids.""" + def __init__( self, server_args: ServerArgs, diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 32d1f43d38..328519cb26 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -62,12 +62,16 @@ @dataclasses.dataclass class ReqState: + """Store the state a request.""" + out_list: List finished: bool event: asyncio.Event class TokenizerManager: + """TokenizerManager is a process that tokenizes the text.""" + def __init__( self, server_args: ServerArgs, @@ -481,11 +485,7 @@ async def _wait_for_response( # Log requests if self.server_args.log_requests and state.finished: - if obj.text is None: - in_obj = {"input_ids": obj.input_ids} - else: - in_obj = {"text": obj.text} - logger.info(f"in={in_obj}, out={out}") + logger.info(f"in={obj}, out={out}") state.out_list = [] if state.finished: diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index fbe3374df7..3ec5cd633f 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -92,11 +92,15 @@ tokenizer_manager = None -@app.get("/v1/health") -async def health(request: Request) -> Response: - """ - Generate 1 token to verify the health of the inference service. - """ +@app.get("/health") +async def health() -> Response: + """Check the health of the http server.""" + return Response(status_code=200) + + +@app.get("/health_generate") +async def health_generate(request: Request) -> Response: + """Check the health of the inference server by generating one token.""" gri = GenerateReqInput( text="s", sampling_params={"max_new_tokens": 1, "temperature": 0.7} ) @@ -109,12 +113,6 @@ async def health(request: Request) -> Response: return Response(status_code=503) -@app.get("/health") -async def health() -> Response: - """Health check.""" - return Response(status_code=200) - - @app.get("/get_model_info") async def get_model_info(): result = { diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 4dd5baccac..33451d645e 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -422,13 +422,13 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--enable-mla", action="store_true", - help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2", + help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2.", ) parser.add_argument( "--attention-reduce-in-fp32", action="store_true", help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16." 
- "This only affects Triton attention kernels", + "This only affects Triton attention kernels.", ) parser.add_argument( "--efficient-weight-load", @@ -452,15 +452,6 @@ def from_cli_args(cls, args: argparse.Namespace): def url(self): return f"http://{self.host}:{self.port}" - def print_mode_args(self): - return ( - f"disable_flashinfer={self.disable_flashinfer}, " - f"attention_reduce_in_fp32={self.attention_reduce_in_fp32}, " - f"disable_radix_cache={self.disable_radix_cache}, " - f"disable_regex_jump_forward={self.disable_regex_jump_forward}, " - f"disable_disk_cache={self.disable_disk_cache}, " - ) - def check_server_args(self): assert ( self.tp_size % self.nnodes == 0 @@ -469,7 +460,7 @@ def check_server_args(self): self.dp_size > 1 and self.node_rank is not None ), "multi-node data parallel is not supported" if "gemma-2" in self.model_path.lower(): - logger.info(f"When using sliding window in gemma-2, turn on flashinfer.") + logger.info("When using sliding window in gemma-2, turn on flashinfer.") self.disable_flashinfer = False From 364d3d72a78ba4ce3b0cfde7e28e40d91679cb8e Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Thu, 22 Aug 2024 01:16:35 -0700 Subject: [PATCH 068/118] Fix broken penalty (#1184) --- python/sglang/srt/managers/schedule_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 88a6168325..75c33bb8b4 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -615,7 +615,7 @@ def prepare_for_decode(self, input_ids=None): for r in self.reqs ] else: - self.penalizer_orchestrator.cumulate_input_tokens(input_ids) + self.sampling_info.penalizer_orchestrator.cumulate_input_tokens(input_ids) self.input_ids = torch.tensor(input_ids, dtype=torch.int32, device="cuda") self.seq_lens.add_(1) From 5fafcac00834253a18a3f10551dfc8221fcc360b Mon Sep 17 00:00:00 2001 From: Ying Sheng Date: Thu, 22 Aug 2024 02:03:25 -0700 Subject: [PATCH 069/118] Fix benchmark script (#1185) --- python/sglang/bench_latency.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/bench_latency.py b/python/sglang/bench_latency.py index d9131c87f0..ba1a81d54d 100644 --- a/python/sglang/bench_latency.py +++ b/python/sglang/bench_latency.py @@ -350,7 +350,7 @@ def latency_test( for bs, il, ol in itertools.product( bench_args.batch_size, bench_args.input_len, bench_args.output_len ): - req = prepare_synthetic_inputs_for_latency_test(bs, il) + reqs = prepare_synthetic_inputs_for_latency_test(bs, il) ret = latency_test_run_once( bench_args.run_name, model_runner, rank_print, reqs, bs, il, ol ) From a5b14ad04337a3371ca2513ef95a5add28b3f34d Mon Sep 17 00:00:00 2001 From: Kaichen Zhang - NTU Date: Sat, 24 Aug 2024 05:11:16 +0800 Subject: [PATCH 070/118] [Feat/WIP] add llava-onevision, with support for (1) siglip encoder, (2) qwen2 decoder (3) openai api compatible server. 
(#1123) Co-authored-by: Bo Li --- README.md | 5 + .../usage/llava/http_llava_onevision_test.py | 211 ++++++++++++++++ .../usage/llava_video/srt_example_llava_v.py | 16 +- .../llava_video/videos/Q98Z4OTh8RwmDonc.mp4 | Bin 316390 -> 0 bytes python/pyproject.toml | 2 +- python/sglang/lang/chat_template.py | 11 +- python/sglang/srt/conversation.py | 51 +++- .../sglang/srt/managers/tokenizer_manager.py | 50 +++- python/sglang/srt/managers/tp_worker.py | 13 +- python/sglang/srt/mm_utils.py | 86 ++++++- .../srt/model_executor/forward_batch_info.py | 21 +- python/sglang/srt/models/llava.py | 235 +++++++++++++----- test/srt/test_vision_openai_server.py | 93 ++++++- 13 files changed, 701 insertions(+), 93 deletions(-) create mode 100644 examples/usage/llava/http_llava_onevision_test.py delete mode 100644 examples/usage/llava_video/videos/Q98Z4OTh8RwmDonc.mp4 diff --git a/README.md b/README.md index c7d47d6786..c118d6a1a0 100644 --- a/README.md +++ b/README.md @@ -231,8 +231,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000` - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000` - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000` + - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --host=127.0.0.1 --tp-size=1 --chat-template=llava_llama_3` + - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --host="127.0.0.1" --tp-size=8 --chat-template=chatml-llava` - LLaVA-NeXT-Video - see [examples/usage/llava_video](examples/usage/llava_video) +- [LLaVA-OneVision](https://arxiv.org/abs/2408.03326) + - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --host=127.0.0.1 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384` + - see [test/srt/test_llava_onevision_openai_server.py](test/srt/test_llava_onevision_openai_server.py) - Yi-VL - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py). 
- StableLM diff --git a/examples/usage/llava/http_llava_onevision_test.py b/examples/usage/llava/http_llava_onevision_test.py new file mode 100644 index 0000000000..c32d529819 --- /dev/null +++ b/examples/usage/llava/http_llava_onevision_test.py @@ -0,0 +1,211 @@ +import base64 +import io +import os +import sys +import time + +import numpy as np +import openai +import requests +from decord import VideoReader, cpu +from PIL import Image + +# pip install httpx==0.23.3 +# pip install decord +# pip install protobuf==3.20.0 + + +def download_video(url, cache_dir): + file_path = os.path.join(cache_dir, "jobs.mp4") + os.makedirs(cache_dir, exist_ok=True) + + response = requests.get(url) + response.raise_for_status() + + with open(file_path, "wb") as f: + f.write(response.content) + + print(f"File downloaded and saved to: {file_path}") + return file_path + + +def create_openai_client(base_url): + return openai.Client(api_key="EMPTY", base_url=base_url) + + +def image_stream_request_test(client): + print("----------------------Image Stream Request Test----------------------") + stream_request = client.chat.completions.create( + model="default", + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" + }, + }, + { + "type": "text", + "text": "Please describe this image. Please list the benchmarks and the models.", + }, + ], + }, + ], + temperature=0.7, + max_tokens=1024, + stream=True, + ) + stream_response = "" + + for chunk in stream_request: + if chunk.choices[0].delta.content is not None: + content = chunk.choices[0].delta.content + stream_response += content + sys.stdout.write(content) + sys.stdout.flush() + + print("-" * 30) + + +def video_stream_request_test(client, video_path): + print("------------------------Video Stream Request Test----------------------") + messages = prepare_video_messages(video_path) + + start_time = time.time() + video_request = client.chat.completions.create( + model="default", + messages=messages, + temperature=0, + max_tokens=1024, + stream=True, + ) + print("-" * 30) + video_response = "" + + for chunk in video_request: + if chunk.choices[0].delta.content is not None: + content = chunk.choices[0].delta.content + video_response += content + sys.stdout.write(content) + sys.stdout.flush() + print("-" * 30) + + +def image_speed_test(client): + print("----------------------Image Speed Test----------------------") + start_time = time.time() + request = client.chat.completions.create( + model="default", + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" + }, + }, + { + "type": "text", + "text": "Please describe this image. 
Please list the benchmarks and the models.", + }, + ], + }, + ], + temperature=0, + max_tokens=1024, + ) + end_time = time.time() + response = request.choices[0].message.content + print(response) + print("-" * 30) + print_speed_test_results(request, start_time, end_time) + + +def video_speed_test(client, video_path): + print("------------------------Video Speed Test------------------------") + messages = prepare_video_messages(video_path) + + start_time = time.time() + video_request = client.chat.completions.create( + model="default", + messages=messages, + temperature=0, + max_tokens=1024, + ) + end_time = time.time() + video_response = video_request.choices[0].message.content + print(video_response) + print("-" * 30) + print_speed_test_results(video_request, start_time, end_time) + + +def prepare_video_messages(video_path): + max_frames_num = 32 + vr = VideoReader(video_path, ctx=cpu(0)) + total_frame_num = len(vr) + uniform_sampled_frames = np.linspace( + 0, total_frame_num - 1, max_frames_num, dtype=int + ) + frame_idx = uniform_sampled_frames.tolist() + frames = vr.get_batch(frame_idx).asnumpy() + + base64_frames = [] + for frame in frames: + pil_img = Image.fromarray(frame) + buff = io.BytesIO() + pil_img.save(buff, format="JPEG") + base64_str = base64.b64encode(buff.getvalue()).decode("utf-8") + base64_frames.append(base64_str) + + messages = [{"role": "user", "content": []}] + frame_format = { + "type": "image_url", + "image_url": {"url": "data:image/jpeg;base64,{}"}, + } + + for base64_frame in base64_frames: + frame_format["image_url"]["url"] = "data:image/jpeg;base64,{}".format( + base64_frame + ) + messages[0]["content"].append(frame_format.copy()) + + prompt = {"type": "text", "text": "Please describe the video in detail."} + messages[0]["content"].append(prompt) + + return messages + + +def print_speed_test_results(request, start_time, end_time): + total_tokens = request.usage.total_tokens + completion_tokens = request.usage.completion_tokens + prompt_tokens = request.usage.prompt_tokens + + print(f"Total tokens: {total_tokens}") + print(f"Completion tokens: {completion_tokens}") + print(f"Prompt tokens: {prompt_tokens}") + print(f"Time taken: {end_time - start_time} seconds") + print(f"Token per second: {total_tokens / (end_time - start_time)}") + print(f"Completion token per second: {completion_tokens / (end_time - start_time)}") + print(f"Prompt token per second: {prompt_tokens / (end_time - start_time)}") + + +def main(): + url = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4" + cache_dir = os.path.expanduser("~/.cache") + video_path = download_video(url, cache_dir) + + client = create_openai_client("http://127.0.0.1:30000/v1") + + image_stream_request_test(client) + video_stream_request_test(client, video_path) + image_speed_test(client) + video_speed_test(client, video_path) + + +if __name__ == "__main__": + main() diff --git a/examples/usage/llava_video/srt_example_llava_v.py b/examples/usage/llava_video/srt_example_llava_v.py index 27ba862d30..7421dfcdfb 100644 --- a/examples/usage/llava_video/srt_example_llava_v.py +++ b/examples/usage/llava_video/srt_example_llava_v.py @@ -121,6 +121,20 @@ def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size= if __name__ == "__main__": + url = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4" + + cache_dir = os.path.expanduser("~/.cache") + file_path = os.path.join(cache_dir, "jobs.mp4") + + os.makedirs(cache_dir, 
exist_ok=True) + + response = requests.get(url) + response.raise_for_status() # Raise an exception for bad responses + + with open(file_path, "wb") as f: + f.write(response.content) + + print(f"File downloaded and saved to: {file_path}") # Create the parser parser = argparse.ArgumentParser( description="Run video processing with specified port." @@ -148,7 +162,7 @@ def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size= parser.add_argument( "--video-dir", type=str, - default="./videos/Q98Z4OTh8RwmDonc.mp4", + default=os.path.expanduser("~/.cache/jobs.mp4"), help="The directory or path for the processed video files.", ) parser.add_argument( diff --git a/examples/usage/llava_video/videos/Q98Z4OTh8RwmDonc.mp4 b/examples/usage/llava_video/videos/Q98Z4OTh8RwmDonc.mp4 deleted file mode 100644 index 32d912dbfa17c8426906ca1163b2e5e30fa84fef..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 316390 zcmeFX1yoht*XX^^p%g??x=WC5q#LA>QbD>yy1PL{Bot|+krJdNrKP(OBqXFuLgHJf z&;9#6@3{B>z2m+0jq&Z_@60vlnsctX)?Ry`v)MurL}2RVZfD_WYXd`DT-Ac#cB z#@5ya*sN__%#DGB7MSu;uDE{tBji$<4!nQgeS zdN5HS<;%;(t^ZW$7G3^W-?SQ<>eg&?flYIAS6I667~Jg2{WV8M_Q1!$suGy?WtWx=lY*~`_JlsUC;m7xPSWp@1DoM8}F}q{X4(^YXAT0b@<;kkAJrg z|D5MP^?w)tKklb6aTy~_Ddpc|B{G> zzxDRtlKGc}GyIlnK;r&tU*-Ba2ckc8_P_T-Ne~ouwZ{4X^7$DDYJjl+WrJUxt6RTx z__g0s;g^KX{+9i}?cPA*{c8Wrb^6;L0OTLFpnL%O?SJ_pM*V7|T&?HtcqF}FHpJg+ zhICawm=}W9FNypFNUC4$pZ2Rg3Hsqz`{T+FoX0=c1L5}=Na(=l^DiI7t8)u{5p{mY z{GK1e)w%y;Eq~_xQ_uQ)KMd@PKmB1>d-4zeS72YjOn?oH^%pSTUu}e?UlNfE$loyt zy#Hz=U+w2V*7WB-!2cEVbDsPE^8GoNf9wM!^?&sv#Qm#%<%=}`5B=HSbwt*`>j*x7 z*WqA)|5fwPTBQE2qbvjcYJG8lyyEx#5$yl&2fy0?zvm9W+5^AaSN?Et&HkywYX71C zwO0*({+$AXzJdPV<6KGD6wrVBz~lcO3%^?XKgNQ2{E9`q`cu>&^}m1SNudNNO?!U0v>P905NdC>QLjTu@UH zxbGKVKu`%--*<2bs(J(dG$#TtY3*(@hl! 
zxb001ESam)C}G?0QI&N?%HOm7cQkKXP)g3Ol`|sjkIc#$J3J$PoU1fZiHN& zE5IC3RwK9(|A5Sxp$M(SPSL-+`{ux6Ex8jJIp))E>i!pf>axYB=j=Zd$7bm zJbm(1-_&C7Za%$_&@2A^#}-09pD0Y-L(+T+**2EXrMMKAuK3 zj@&ljULa|&(2(}+U$RfK@m9xSS{o`|UDUOg`ON97d$ZrwV`_^G(_gHO*_Z<5$?5AX zAIf4;V@}1G@iAe78Hw5(o3QcFfWupg65Fo=WpO>$8}RsKX7hEKD9#nRNw6HyPzy<{ z=Ggv?e?4_rJH_p>h94`kNi7yL+a^jH~N) z!b*eN^1hAY5Qdo#+H0o%hvaGd<~B=4rD;1fHdaV{To&?aY!T#ltfTI$GU|=3_yiA0AA^z8-_)h=v zor>&!(oZi9bNrY4^eMOdJvZpOX!ZOE^OKznD>8KUO4c2)CteBLwhg_s+AsW+aw9po z*@h2xpk6bdrMXBV-0nlg4I9>R!DYme>ZD19SW~gz`%@#+F-8rK3_g{ay*BBRd2PoZ z-anye0?Us`+^iC^4dc+bC_m)D$2L@o=0IU;tnB+18#RCc5qf`Uxp3&Q(IDUH^c8VH zYMkd9xaY}UOJ<8^6cX`;wl=XPSkyOXN3w~G?fEkg%WoI>X~7t7RS-$D=5$nZ6vHsv%S%$5SdrVUbjYgC~$N&sPKLHFq#MTRGfxMj8l(b z3r*9~5kh&r?M7JKm@uIgayn9Eh{ zs8?cqEL$&2ba}y^E)s3MdYEHM)H_2+2=7TZ&{rMFjAu!Urxr>T?9JmJCN;IE<{p|D zaSboznV-`wi}(3B`k=GPEAIJT+C5eE7d!f7(Aw~WswMNa`xHFSXlYNSj?#O}R8&#@ zJ24gtSo9}?xVg)EjaKgpm)pEj&%6E5uVi;Y`AeYtgeS^!$0X85;KiLgar%Dv-(6mk zMtk_q(m$|%L)-Z#53VC86MU1q3~5Rd1vT|u>*Dbl=GhNzYcJxjj7s+xc1pN@xJe+6 zeVo0)SGxX009)x~tR;$~o$xH(`Ze!Pv)wl>okpE)EXAVNeJ^qs<;W9aB~K5KlT@=P z(CrpN2od({NgLmk>?yxjRT7<c1w7{~VE&dVsxtwWK&xE0BTKbI z*`*%Pl;4XarHEVYPN!+(knvx((o}U)X6ux}{L-y8MJNdWN?HPi&m(C$t#ARV&JwxV znG#4AY%vAgp{R_TOUF7Pc}g)i+lo*!AQ#W7csbrP>aMGfsa0;@sMA}YT+pdV;=2c| z>+Ej^4&ZFdHezD+={2K*xqzuGU>6u9S9OX-`UM)oZhOB~M);a7r69?$y)!*a5mA%7 z|HY!QMCpf;V?EbUFXl>d+z$u8bAW|Qa7;0HpvA8-4cLOjsPVY9Ab_FE1j!*V6?TX@SKaqP zVY3>*S=5m$tjnMp$p>92(ia+|o06Ao>tgeCbE1^oa9QwFd^3sX2$zVE{vAf}O$W53RthmdY3e@tODNG8( zzFwey2fDCMFbZ~bEZd%jEcO%jz|;=SCk&+fRcii9hTlzk`A6?Fw;46wu5r{_{WJ-4 zyx0LM0dO~Mquv(t1PedIIxS~bc0RhsqHxm@ads0ksB2#o@)UN(HM|v7X5J-i&uCpt zn+x$6?ROP~xJdnAc77}HI<#-g#JylcwTW7lGFU>Oift+Nftmo%Bqj&aEri>-KA~ku z#u&tE+JkP`Ff2Y4egR!5a-GbeGg4;|o5T#l|JF;oKQ2_RUkoS^y^tB&(V|N+w_cue zBM`b0V39OqQq|Hx%!gf)NQC-ghyAOaI22(w*FLZc!R(V`$b&NZ+}E|O>Q;t5$3sWf zpFV@p!6BgL|J?|BMuQKziN<`uI3KkHrI{yg_Wkt7K3kfy8(3SX>PWGz8foxDbaTXq zImxP#r=AA485P6P*zDwbpoH%~wFz9Sn!F*iPa!Sc)a?GD;X@29bD#6k2X1y+f^6NB z3+KvfD8>j}EwWz`EvTWl_^fOMV(Y;?%p zly%%tANW0El!dsj1d=LSF8rtr21}zmU?HJ+#e*2oQjdDkso?yp-m+;H1auWB%K3Iy zge@V)(JRb(F*SonY6rc%Jb0`RoN@x;Ep$~qcPLR>MU?@fyq^bJY2LnxdHI7 z*nU_@kh|HiwQ=Ki3JftwbpMuW%6Qqa-lG*W}mOPnJ30D z@jr-IlEl1aE6;x6{*jJ-vkmf_VYj&(0sU8EdtM7h|BP-??ee_2#G{^HD1wn!6~Ru(HZNqMCXr z3SA$>)wFlNFzt{y5v_SW4vuKsNO-TlmkT0!@UocIBAzRI>t|Q~&P|n}Q?cmKVX{IrRRE&k(eUuf=r=sdVHcNj z(qA-}b`aZ2XGmc~NEUD435adTv)4u%d_zTD{Y}<6TVbeZHj6ez9=5%$iQWg0x|UU~ zEfL7>G=I07wgD0%&HGI=IfWeV5joP`WYUSLaU+Df@4}y}2B!;|)Y>M_X4T-6(6f{F$<*VBQ>k3Yn0+b}39Dt!5L6?8FKF29nWs z1-T;l&I&a{^UA#Lem$dBdiRFb3+S zjG$mzW;_{HYx~1I+g|-MhD<lF($>Ak~QvS2w? 
z;qlGoC%cS&I}(@`%Q!IAnlk9xrwc zGfL20tC?R#=VQTYsU$Qd0M6k+k8@sSCv0{1ceitd@kwqG7|(-buI~l4pdEdGMfBni z&Lx}@Y=+GolG+1*cxj?pb7IiU_e6GRg^f;RP*Uz(fP*kD`1pFrh7UQyi7|Djc(poD zcC4MDsP{Ey{NokCH6ddh-lz)IIk%A-d0&{cG}obstl#G&p7Obl6e8KWO(L{lgvT!S zc#bI^;aA-KHL^r~Z;KYw9d)M^=knc<8W1Fx0|xp;#Bu^ZOru;WHzh3;zz=++@$SXB zM03btsDRg050rY7q|wodqx(y{_Cm~8`^JB?_f*md{*Di=h_=^#CCAAxj6lznB*&d3 zSPNH827SMIfFhRx=D(mf|M$8)zOfUxOnZd5@UQ9_mfepv>itrSjv}>q`3QNx;$@)Z z7c7HQ&d>YHYuQmws5Z!elQKqBSv}PxCyd-Y3x?#B!~9k2Z#`%wtJAL{bS}2t$#9M; z>VB7-!8e(yn9ToX`W-=I!X@c+Li|;t0GRvLORq(M*2uX}YDYjJDs)#5#c1llW|S%@)dCdx++oQy*q6#ot@KB zC!x3*yQQX0&zPlZOP_P*7Prm~Q50DLa{P|N@|LAwTWo9<0%b2$B$TYRgh3d|du0}P z>Kvi~P1F)$b$OeHV-1hb-WT& zijSl{Mm0u6v)Yg2d-91$>M|gU)8%Dh3p$=B&!$nEybSx-ZG72M+BkKWj7yIahgA*u zDQC9#ZT+1&^17lL=4@XkJ*aJD_?KhhKf9U(|(3N6wiNqnNtCsK3Dfb;z@XX&J zq6*+!gc=k-TmpXHjnR>-{DbGN@;#pPQaw`+4f&DG!(9yZ+}qQZjldN;kre)bBgI)- zFooMD$r(`@ULde5uFbtPpgb`HL1Bgnhh6Q2C?LIhE_r))^d|Q-n^uME8S|o_JZ4jG z;~5j!(w&w6Uo-0z5vdw{FA-xv*xyo)Jwx+)On|boPr#P7rh?nI8(%diFpDh9A!7)v zp4=4fWQ-3&H%tL+Jj0o%VF`{9ir>)b(UmG;__3*r9NR>w#cL8bAYG?dC9 zTrHPZfC0G1a|mM4J}Av zEk5_~2@0E&RKb9GZ1$FBk!vOUq5X5Q~KF!29syvBQoQ9y1+`n5kI$In%ST{$iI zSaoUvpgJypre?rIAmDT`mEt2)!L7 zJ7BaA9ieePJ~>zs$!P|)H}rn94Q~@5#P9uU&4eSn z7dS@axV4pzAefuiOFjSN7;g~m$5u-)CEDW+-O8~7W6VjC>j?@Heef(iIIfGwEdg@x z76L*q|Nd#K9*hx9d-8x?uYY)q2X#oO404Edpwx-)+hzz!FO9P+p(Sp9239N}NtBT@ zSkG!)Xtv96PPKAAfpZ~Ep-2XprM6=$Rz`F?@4?8-tt#*D@}d3ZCGhLw&|Kc;k3)AD zX3C;SER*j=ZD)^~=Es}ZWho$+9IbPOox_hzH0Pj$-JdmdPfJIRw`VOEKrKkMGSiTd z_@@@sg;BEWr!=Eh?P9&JIuXj^Y8W zG=7FopoNEJV6N9*f7ki|hFvRw)spvoN(Zy2sF1zGgEE4E$P3vKf-%8#GU;w{1eDy}4 z6Tfb#IL}M$EFUybl{?Bc(TB7V@1Y5T`);ZTjfIkOmXfE3UbM&Ylc!HJGtezShoc6; zJ>8db)~-S8VTq~c8-;+@7!0oCZDFd;jBez#Bf)}x9dP5xj{ybHcdW^mqHpDhhzFYg z=|kUSl#)+U7me@vz#C+fK(z8@4f{}8|C=+^#X=h50DpA2y&JZlKAAg)0tL!pqXYw= zOl@2J4fZAHfpC84zd7L!v;40CUaqX+$VvJXl(G{iJDBdG5MpS86jk19`gT0N*IAKI znSheQhiiDr)K9d{T4}2Y0Do=X`)9> z2ka7QVrTjF(HjVemYiMfom#woReoauAM54JWM5k_6a(>a*x`tgylk<@lg z$BtR|D}&+fr5B}0_6C;M%f8;zOJI9l&hh8&&-ob>BU1L!?}9OHc+&xNC!H@62aVk& zMTwXiOf`ZLtSr9ygMg|(PqwF$ewIb9aGNeh_ZbM);!%$qRH`D>QgczN=tfwE9M~$lyB4D(?_Bz0xl-8=} z&2i0i0OS(4GccN=gw^xm67g1c=&5%5`uPXO(d!OSr-mrk9c(!3@?P6Lg{(ZinRHh$-YRs#CZHCX39rlbp9 zAp=fkJSKZ)2I)jLND#C-M?ZoD_}b(rcW@I$TK_griHeJ~BH^%rIFOmTPGeQpAoeVj zQR;h%!`mOMyBW^b$173iB3SF#mg|z6;4n$&xPft0RRRlAo9l4Rn!K}hU2p=Oc8glH zvVnd+;o`J1#C>zCuxB%tUI)3Wy_Jin5dqSx3C0ggUT_IIraUs^CQbl6K|1tV)|9}O zISTdznL-`EcV^4wYh`4jHtDf_HAt9TPvLcaIa;O*XPhGK*OmQW5AMz}*(#=Gm09?$X!g*r|$$*CE`Z zH?|-^LV#21*n=rnXA9}IW)49bDr@BhwnxzQ20^$KK%P8YEdcFhLV_ph@xyGO6S!%% zm>I|Qf}d2)b19v;hLKNMVoYf^YCR&980seFKTOweL8FPy(Fb7};FLsx{tC(5+0tps zgBS1T3x2l6C*qO{$nB%spO4y&L{j?`R#de7FMR~Vn+V?a1Z1~8oP3D->dToQ9c5G{ zUBtW8#M}d|Ph^cHKvQnh)8(KS?PV7RKUeDS>R6E^+<9M!=~>+R%8J!chh#eyZWor* zgTpoXbj9OTB2-!r)M0n+8-FW?n5C_lgpN;|3B8&*i%FWEx3G}=Kd=5L{c34`eG%c% z6ie)~>kKmbjSAEMzo5mx~BOfk*= zs*49pUdDId+v*GR?vC4Sp}52UI=~i!V&0KfgVL?VTRQYce5OiOi zyLwXl4dLTMvKrg@vt`55Qo)){GGx#`%jMk=%4lM{U#Lsm8`R{$%#5;Ij2;BZ){Spl zEyOd3L>7SYp;Qw}ZuhAVISFJnIo#X9G~AUXTZzpbze0~YS|}%`>~@+$%6_ag%Duab zE!8FyRqx4WFK}K_=uLpF)+G0jls04@BM zBFQl~qq1Qil2+bY;46x}VBLgzOPefwgmTdKuBsidLdCZU$L)_1;vG8x)(pRH32$(w z^H}anL6$vZDD_MV5fI|nQJex^J-blg)0&qD&OpBGbl2K4npeEVlA7#IZg2-N&G$Z- zM?^eC1R5-(sIN?2Z^0l_SwqZm17%?+RvSWisdhY!^m77H)9kcMfDki$9U(vqfrj*Y za({&S^Qv=gD(k^94!@9?*PTZF`+mLTJ(m^~yzJ|pL3d@ZWy;ri3LAEc3$&&4NxF!G zK1{o(VgbVd7yN_ibcH4I74|cVrDjJuV)T<@YS-!L^>+0c!sz@XUau1+@eoH~VJE@z z=^;`BcGgg{!WTx?OJU7R%5_kSq~qU3q{*2s^2XwXP4*QdBa7={r>v_uzSmaCqDyOk zr(^-`>?OO>qPT1Jv7`~rrN#SLB3llcQu4ZGyDg7DUxjMqIU9&InKKYx9ev6_5ttkP zz|<`gLLn%Bw22`j{wo+bnNMhMU0 
zjv;*%gV|cgU?JZ%^vg|$I)+9Z2GW8gb+3I9Nsi14#hN3vW_N8_K``73LoM;{Ts(LpLrzHzs{B=$ z66tB59HB+E?b@*h8FI7i2x_kJ%=+EqRu1H=6dzotMr^$BZPim#7R0TXW4$;sdQ+L2OM)K&00GHCnj&m}009I0IhDh- zcA%+~#*VF|W{CI+LKS7v;pa%j%1K0(MEC5qBX4o(v@PY7wc?QAGO{ywkWVCzzxVN! z?LFFruDm%&4FattW1F_MAD;Aiz`06Cs8| z(OcXbAY!Ekv*niBC-wEM2mD7C@AFL41LV>RFEfHGkzuD{ALM%>Z*-cNFgZN&=FMYE zdO7YDf-S16T{6$JL{4&KyKd8iz^N&|=!na`hdAuC3S0y!YgQ%6z~(TNhc+36-_m0W zh|%AS|Be`-u?WMURQ}%j-w^&vXlV$Oe^G$Xu3h4gsz`15ikMw5xviusygr&S@p~Z@ zHW=A|b3{Uk-%NM0R$l?l!UekcQ=!Xl_`SSDQ@NdSx?+b1lQo+Z7i)M;@vg$@{`iDxd(BYW2H)St#O#*f)Sl5e!E?Vz6WqZ!UzlxKDH0!n(odz1?RO0Jv@;zh$}umDEIbD*wmw(dCT;;pYL2m?9m_qb~nu8PvPk3+nG@CcIJ;NZ7O@m;I;R)&~EuDfB#%?FmRt`vb{Vq@4h{A9sPLZ!&Wqe z`!37=x}nXNe#c)%==pl<3JMDO{-l(yh(f37b@1t&Pyhe`L_wZpY9S8+)k`FCvWnF^ zib~}B(pce2e;wqK+oKZIRE5S*Je2A*JrnYdM?5w?qa!JbEsBfv;OmQ9i8$7rfWI%@_+J++*NxH5P;yh zj;ISWd*kTMK7|An@nwL9{LxJR+L5P_vT8%GI6tqL02xsL002b+o`r5gp8^eT*=U3I z)}JES@_2Z_I(V28UDg$5r@7&HoL77?N~KBnNSp-eZvx*7Ev zAaaz$Qz6t6vr3aPAil6UN8CpzEcus00+$7cAU*LlTbM+@OUKn-Ufs#Gl3V%!9@hDxLK*KCCMbqUZ1YrOG09pZ_ zr*1-@0s7^6Il{uCOV?_Ax1Y-0AVzSVjmV&h+iqbBCg5mi`9AL6q#mzaBA>5%u}fxl zk(_3(N+iHoYwq=#O`o~lzj|yh#i;{QZ#|s~=>w(zAb0rR`T*H_8Q~^Z?q}d7!_9??Cz=RAXrqzL;NXv zEdT&eXyx9Z$(?^=%wl>izp5gA2F?7I4dYF^jAW01zhtVJZD>S)j5}k+e@w~a&<%4+ zs2CMPmZ~nVWvYD5Xph1t#g&5YdD|*0xq6$c z=-g99{+CWb!9qvQt`9{b2wY^`pYbymR>TH!o9X)Zs7a^~lQ;(xhZ^d?ls z!7040nu;3oVj9&Fa!$Hh2>!}v9LcuizU!Afc_D-})B3*SyrneA_#AAW;+V46ntk7u zFt$1vKis^-=uG{>18`!DOb=-gn9h_$_g#=nTs1cv_Keaq1lBLGn=uGy>dhwji_6*< zB=(*6D>3kc`$6&c)VWx#*5bt0M7mfp>b*&8`Y6nGpD@SvQ~+BT3D=GS>1UO-3w0D_ zJKzu!9f_uW$S<`F-9)J$M9J>+2HM4;fW>JoqPt^jy{oB-txuB>bij3RzGR8|sMXi1 z#sAnq|KsUS_L(I;fvbm+g1(XjTpd|UWhZF6FYE!xeu9WnDVc|qf}fpY8i;>~3}d{_ z4jA~oR5@eiQm@$T5Kcj-+8E$|4wV=nM2kqh?Bb+J{Z^Z)r)8lFyPlo z+22)pwm3aQE&u^xw;SS`fS?y;Dz0t;U#4GL-&Ylj8k~~s%(H6auaC;hH&NoH-xmHX zqp_eAstVcRc*QS05d^-^H#kq=jYT58d)P>ZaBz3dXJ&>o#E|DTGuHqB04M>T$!GugVMy!I`x`_6r>1O4aggMF(#(w^LoFXo+b9un?ME+Ji zOe~$9{^pTj`IS^6UtU7PB@Jy_Ro^uM`{&Og=W*2o4ZK3ciH>UPpg9CW!8LhsT6eht zH(zr}{ny+CPk}D|#4q@vV+ruz$BVV4Fyjh8gF{C=qG2kTA?|u-p2HMA6cDXE%C3tE zq}2k}gYgRMN+CWB0;jgbjmu~>496Gk{*n1JQz&jpF~84qsX0}8Xw_9l44bn}YIehm z1F6DZ(N#ntu4(9oz?ikFpq7FCO-h-bCM%}@`M?qD zqQ%KJcpr%Qm+)CULB+!j60d{kZ+WAIZqe;e{WG3MsFrL1PFeKH%&(gh-Z|1%E?^kV zk}{1RZrynG08=(^$=%Z+yqwF>OXYkk0^3^QJEZYzvX^&1le@51`2f5EiU+DBVKQ-n z<1c&@fc?-=_bNqJe1Ik6NjqEF5To;k4V|qQ;2 zo4^1WlQRzr{=qfqvF&0xqJ};PYIHIj;$_hbmzfD~T*rd>6YuF^PT|T!S)q@R;W@4{ zWe6o-oW$ZybOu>IKoV#~|Gyp0deR&!%%hS(LJf;LHTbrIL6d*sAP&>$u%B83PeP_& z(ZN9sX!Bs(L-OV#8|2bHW`eZydK4Z{tmo+s& z8#sm5&j{V&JZvx~3>QQF2BAKQzvtUdPpbM+cGG!BntoK74#c@$73OCXnnlV;@`1$$r@zG_kYPo%9tg&v3+;^;Ooxe&`T{C~q7#Ui9 z3&}ttgDY*bl~bg1m}Gj*8!sPeL6%H8g>d+}SWZtNoAH z>yo0$;VnFvgM$)Hzp~V$>5ctpxqsnUrb?-O#5}TfurwthpVJbfhIrQ1_y7O^g#n)H zZbF{{HrUTs@zyeORjdekYBX2?g5fSx0PN1U=E`_36|LU|wZ(2;O%>EC?m~6KrEEQ{ z6}Gfq|L`aBO)e!!SGs2qXY@HXL4#7LFWt|BP3N>_^?D9q*FFQ!CGm~#b z5JQ0}ju9+;xO?qw^0@A&1Ab)1v?*`^Wk3J`0^32F?mQv%Or`>h|ExHT$=1pxp^=tK zPDvW;fa%Ae{#r|!X$-39!9uWG1VTu(f?r7S`222l|K2)B)C0Cy)!P%Jwm_xFS7`>& zykj&MwtH=tYscrZUewJf{`)dAWk^wxOGDW{4Di(wAxzRv-p;~%{398o04I@*5o0)s zgrzhbAJ-VPpL--)Z0-NT_|8Fe*vxa{;kO_FL<+F71EG_Re5x!Tl#1zJE2{vw@ZvsK zR9LKBX7>y{mw>N+njry~1ZdQg@(+zD1p{e**p4pn%X&xd(cuvlA1GH>p17$^Ebjlg zvOfGPy5=@9r3G`BJ#F#Wk6YmDpc{G?C1d?ANsa>5Se#1Q$yCk66<+a>gyU z0hUQ$?|MVg4w>SJ&THOyezia)nr7B%>fv~cE1$0D+M(z`URNjoj7iwffrZmJQ6Uh( z$)3;N6uqa8g{)T!Ar*F#sn{PZ7#rrZJ**fNE%LhYxg7QuX?}X^vA+-S9Asf`pddJ^ zxyRi_;zZzbu(J$vXbq5+!7>|!C2vDv^ER61U!4;`)0pC*4 z_z?q?RMBD>lBfsOwE<>Kxuduoe(^Xfmfo>}%Xw{l3-AOx(+Bj`0K0Ggn&d>s!^n=v 
z$@JboTw6&Bfi1Q>r&EVAAH*pqWe>;J8ze%1?}zjOqZ#^Ewx|I+z!MvascIlC3wlXF z`Du*$pADGbPJ8H9OUSuupE^|``#6@vK>c}OC78}yrLArOCuE|H$G5!zfWyrUoO!!A z1}sv-`dXE1r14T9XK+@6((r(cpzOA~y0wCpn)`cW#sc^fSblxSLosiREQIVFeq2C?PHoL+>NL6IljaBqEg+Uz zr2lDUgz-hfU2zxL2l+DKI^dAr5EFmFon;hO?U<7%c85~t3l;=_XPnX8nCC6}pRj$A zsWTI_>r^z-BDu*roHyi+$@FHFjIodlthl3A!0D;Hk5ub=6Cn4+cU)vC`t--mY0JlVZ75sWf%1We2EXUvyOdzyC^|`jq8lmSC)R`Pd z?14NC9<^J6VdTG>>%uzo$s809t+Otvwl@F?v{LZpduFL-sOwS0&wR&GtC?i@Zy83dY+5JU$r8QyBlN%M@Jgkf9U zp`R{mpSdh51cTsZX-R{h>B|*n)`0tK=1|i&Z@Mm+vJ5yVHQ;s35m>eVx2XWx%@n6X z!P358nZLYSiQ(0P0Lv60lGud8XjpO)N6MA8XwjcqlF}IFieUofwxOy^`4A6P*|(tW zje~L6&!oGjq$w4%uIXrxx}B1~gdo7U{m}k&e=@5g1U zV&c{8vmz1m$n=_N(HEuZdR(%gyEq#zm==17PCZC;+umz&Z1C;}IXQBXhVC_;;|P$= zVlmUjU1&Z}DZo&~bf2FP=%VYgJOIFiwUIfdW>f~wcMc&nz zA>7E718WtQF9$*)$LQ7POM>wnZN&iR7e;Cmr-i-6imh@i@l{7hwR%B*23?zB_yh%> zZZjw+8Xp|!(Q0TcN$~@&_khaC;a=LzrkMT5Pk7sC_gb%&*RnD0*I`3h~QpwCVXD6;L+KgY-Fb$hX_i*o6vO5@S2 z;~klo{Aydz$OvuO`ffR{me`AIw&s1wEQFOdC=%B;NmB#V;+U|H_)JYks?_X;=*HuS z)ILHk>$T}l0g*3*z}zegdYWT1bXerK^P7qm)c-D@5WC`wGZ9Dw!XEH&7duVM0001K z0iQ2!LZ1OHwVX$_y$`XAZa!2hNtLKV`h)xUL1j;eL3vp9WCOg*q!7mC{zdx!$1Q|K zb(HH0J8)ZG+{TX8+);a%A%zcIScuq@a6?eb5jmC$U|5qrG0p^^jZ|5ndxFQBa3}zs zfB*mk3_+VRJR$iqm#DpxrI6}+v*AwPYktr2=Q}CwMq&mPC?#C&|llbFbD-V=a=S4 z@f^m|j3HU5D3qsgoi|>}8?>L6kW`_(nVbBW#7z?JHTh;e=!*i(f2bu%=`Xm*X3y=J4DR+06cwN9_mQK5=-vv~%jZAxYVf-MIH zgw6+Rdg|_%f>NoSfEMh1c`oYX+BW?jc6UbGga__5?~X~p3L!=k^au+?{eu%@ynZpr zW=A-U*9$b^R#^oF{)y1Kh{Ef#nOkL@QiBc@{VlLzEQsmT6wjHa?xR2#>m|08U=NF( zI!wNIq&HAkGIbisG$X-Ewqt6KhJ!Rikx1@zfuZ?bZiwFjd7h{b=X=lt+eM+UUknz* z8t1i|p#_26?2Me#mF)f_>8`DSi?S&3NN{U&zxlPwbc+WlRPSjhw~!LiZLu)IBu&@= z-ZQof%rvltNjuz)6A%{0x^DK``%eyEJ~CQDqAi)?-;)OQ2uTlJygRiL5?eIrv>(pn zK2jKQ!oewNF_zx2E$C^KDFhpvL_zMcD1Od$pf>~;LL%k?)N1~6R!E1bv*fnmi0*fQ znwS%ljCHRq5LpXFHQpYO4pvJrScGZrD&s`^Z05>kh#8pwj78htTLqC$lpQv9bdS9O zY&Am4$waT_1otZh6b|;VERwF6MVqr<_~;d-$Mpwf5wa9zGAQP`#5KO7P%7^el${v| z-aa@E0E;n}!V-`WX2p~u^6Su43Hqn*)ZWD<-@X^s+Xb$tWe`yv5Uh1>o@K^LbQeZ( zH&mIAJN3pINDI7A!QC@_&X_DtZa{}0~M z*#e|E{r~^~R{@_7`%wnst2SSw%=z!J&VvxM8DoJB8-pd>PMh% zs0wb0prl4bXPK^dnNST`(CmEgb^zt|00>y^(4~;@#~~R|-vC<(I@q`G@sszm+O{j` znRAFW3aA(IL;&NLF(kzT+yXj5H}{}n{Bc^(jfbG>+1W0*yILxmP9VVM zhI*d|+*5C^y{>a&2rYpD`bI7#4#T~R0=m=C-Q$BFEJoIFvLqam0)Q_wj@8hK1+hXE zwO}5h%nXA|6u=dPv0_fYW));Jr%OWrOB-vFj$LfiZ6dyjOeD{}rcc9eAOeVj^%@+E z+^{gUx3GD^$w&wXzlA5vh<+q8#5ox<-j)}@5v4F%)OnETJmQl z<(1e(PxX>zUy%`?CMucg&=ph>Q%3Rub{J?$Z>cf{_9w95Pf+_U*KLMbrzHn9R~1`9 zv14eMa_~q4fizV$P7P1+(gp=VgqRM|4ffey9upX|e>HMm4MB;xscYrKP^w^BshfpjcaxQ%sT2bnTAX)3#deN zwRp5@J1dowJF>bZ;4u~mIqkM>$rPN6o9l9%Ojh{1qm-W9s?=30IBq9WV8P>q6Pq1#wi#=lL# zhdB4K&>&(DHpx_DAQ{dOZ_}H>f&W6dNJ2|&8D&A2vJJ#wMHvBQ>2ByE9fGN{(#w&6_LOs_c7Tzy>Z!AMt<;#iawm7FjXjJ@RtUt{0FtuYc>^?>vCbQJzUB zM}BXVTf)M8be<9cBH-^!s$phVt|_)MJyhYZ))=Htn{j{JBNH51K%YKqPoDn*sGyr; zjW}31`V%tLiUXyaD#yFWjWS`6tAAB4LJNqgatR5s;lF?Qq5*knTb( zv8*tz{CRjdj0gR0-%=jSfD1GCxHh1;LGB}XF?u!N`WS1Pmo>{C;G&RG(Z4YHoY1ht z3v05~_)2u%=B`m@+7)nd2Ha()P~aLGSnKhWesX&+4Gi{Uifpd3-YIP}03mDbjJ?rF z4@~iQb(1;2I1|B(D6QswxNFEG9+79(K1YIXW}{cO zXd6#8CgoyA)2;7AN8105%}H$w@V~X%#kPT8HO9-kDpE$@c@#@)H$5~R%5LwrA>(kg z3T^E<4lE_`_W7E=>2s%1PZ`WXZ<6-kW$JRH&c?3{Gh^fpC zAX__bg0DkaUXJow>CMS`k^!Tyah}NTc<^Fa)Xjm$+&8Xt;TE^P{e>|7GdlUpI?^F7 zhWo>1z{>}4Cd>L?G8_WOL_#3*Uh-JILwRZ{ugWMXbS;5sZ$@8K_)taWx66HVwMBlAU(B*K^B z1SjbkuS z9BOZ`EsOiI2u3i5tym8AbG#Uuqo8lF*JI^yK_Qs`d>Q&fzKUNMsP3H1%AvAvhYsE4 z9n!j+WllFOI8Gj+d@kBhD4+P4SA+m>ac|RlK33c$$Gn+b{1fcB&n?t7>WfOD&3{y1 zPSBeJW}Rl158QKYn}2J-*J_I>Mfme^rg83ZxddKWIsDO2h-kaX!y%EbsicnB>?4-A zx=^GAdcR+c?4Sy6y?bXUG~Q%)3ij-fkV2nysI001ZW$dY@5r7@F#b-07RK4~O2g6% 
zCnj^b7=GNL(m+A=O`D`ScrHK!c6gc;d-NnZ(5*I58Z_pf`_TxFSkXmIKjjyYzUWaD z_(yEpPE=`-ROkuctfjJP>T}|jYwzCfrmIvf?-cOdu9g*HbFRw!C935`2ocGpP!35x z!1w16LYVgKvGY=VP&-h74^eVLBw7!ECP1RmJ*KdCm7%IT+@#gY{&#LcxkB8lF$~Oj zHJ`!pd@;qHRoeM^;DI@$LLM*N-e}0VKOLi)j?op}MV_Mo#;N9mY`FAOXy2lr9d^;; z{JE-Co@hb9M+{y#Y`?e{8VHS@lZ8&|f;&_#_oup5zT{4VaZONvu`{?ubkwiSKtskA zDJ2v-JU}{xAUwZ= zZjJY3S17FnkZ$CZBjITM$@H%>Jsk`P5( zwTk!dbh=gqi+|$Tb#>h87=ocxb@Vpoz|_uY_c?1H9^ZWsf_-Y)UHET(AU!F->I7^0 zU?Y*!72A;fp8%{tQ@`)49~phHLH$QB3jsx0(UrVqsiM)i^>O=D(B*lsea!A|>EiP` z;W}h4h!8Lao-8LdilO8^b~0kFHb5^_YO`7N+(K0q+^Qt$kvpHdr}clM&RzeIxBP0V z{1x>0+(Zaj4Yd^Bx%6#{HFch1E?zuDL5%2WAER-PL;txK(DF1%gO&<(@~9~{KATF_ ztYh?6#u0UFv~}FMF{A1MACdyJFtn4|&ISfIdbBpcY3y_w60$|WQlqIGHcdBT0Wz3LJd4~g zNpgcKp+lrqF;FNBIiHM)!Mg+;L_Es#_GU85!i*boRlG$HOPF-{My!&U^ z>Q)(s$#aD7eexE`p+iK9q}s?qcGb4Sh} zsbw^pK=F1;jfQeXf$QVks#~v1UOukEB*TBX`z$sJ4&>^MwI7OzIYh2|S#!TQ0rkC|x@PV8)znVlEX0Yr3i+s$-(CpLOAme!RYT3IVWrH1P+ zI9?gND`7&BGP@R6y-F+9xJuzKzo%4c8~_LsR&Ze;_EVl0YSb^(TY%iBKeDG*i6mZH zr>eQrYjmZvrNGq^l59K(Gx-)U;e3-Se+&RbZQ=P18g+9Y*Y@fpSTdC*I6c~>vd2Vn!wuo^$?H(ni__HhT*i6HQkyGRUZfSlR3aYMy^^wAJ;lBy$s8w5hAhnv zC4HR_%`{v)L;l2-9YmKeJ8Q|i>|6W3qIqzH)|Z73on76u0;~U zSNKdq*S_imj!m8)oY{AGJN`$T0srcV|C5!?P)#1@ZT^gwlv#cYDV zow#q>s%0cT24|`))#qKT;tDFX8Qn_`O+oFT6UXG=T+#Yw(UHl7Y9r$P>a{2i*3OnA zOj*+!uNZr`A1L_^Lt`7f~qY#0uCEuq(fIIp^T z8SKKjfjvL0TlGP_U~rebZvb@Gh;h?Aj;wxEFwMut63wlk039&^00f9Zo4Pz9`7)Ra zCcj)o{=`q$CjbQNX0$1>?;JvljZ?DflD^a4FajEixnQJ2tbOWui(jZO2|~FeQYaG5 zuhg@IvVFgPc~%kq^9K2*aChP|lN)n_iOqb_IUwTU-CGwGdZ^GDWqd021{M5faf?`{ zRgPtl={YN?!6#X6dYi#G8Lydi5mKAF(?UEYL*6 zBW?8wJ5z5!ks6e|crpy5+&UHy%`N`^01&T`SwSx8bj`>Y4J4#i14!$7v<`tl=aTl{ zzo?%$;>+jF3VpfIrNpr8zj;^Lt0pGM)HNS^Ma!Ekp>WBzif9g! zR(SVrZ&4D^4I*-&a%Q<7k*bJ*6Kc2csocua!JH0ZHq`AMpd|kqd=9Lqu~HOiF!t_F z8hW|8y&Y!hl1*7peh!$odHw$rQ*IG%YdZ)N@tF?X7f6T@L3|J|qs?O$x27Ra;xH7X zQ=Ss4WW=4>6FfB^My)Qk93@`@&fL@VLo%0{d4C;pGhoOi&*vll-oICK(##LNsfjPuiJcp42UO6iz z*!@)cIM>-Fqc9D93(4)Ux{zt98dWB+W~+eMqcKqVU7b$$ya^-U5r~3ra2w;Wb1fO6 zox=bs3DCkrH}z+7nRzF+ap9C15UGl7W-*>D?x!RGMNg7D!IxvhzYo8;z@0yBy9yd| z3+f6Z2K(2*iX$!l6UH4h1q`5g2D9T-$KA}*6Hdy{2qBWgUkGdB1?nZP_s_=jPNC%4 zdY4}OeBhi7piQa?`Q2ZLoX$FnrR9w{j%?NrtmQNBib;0noIofsifQxOE)Ya(A;9Sv z#6J4%mJhJdDI&C$@M;<}#fkgxlERj}vvbJd?+@KvHVnb?6nVhK2@HP7S^1ALUYBDPY);r#C z00#8{00M78o7_Ah`7)RaCcgmF##C(0$0j{)i83UZS{di;X7+0yzDy<%MjC6;pVkl3 z-oVn4U#qCcbVdRMO#nnW>dCI(tp(#xRR^Y>Beny+v~i3C8Z*ZfdfiWMaIY%=!~BP! 
zbY{HLCC&iA+sDcfPGO;q#ca5BUmW6>&Brv6795incg}@lS1iA8V@|KVr#bZk!y*+b zgb0DzJCANRmbwVILh7K{atbki`%{oYoN^CD`N+K-3GjIvoFtxcE|YCj9-51sy@!qy zA$2}pTUa_2!8uhBZRkH`!k?O$zrmXAkFb^U`TkG~+Tlsl;~_adA1{GV@>b?rJm@}* zuE_1MvOAm|r-Py>pc*DOMp&8G*p6oAnmbF8yckf#Y!FOU%v9NPO*=VljW~vo(YNn} z;yB1n!)M>AO_F%AGj+s`=^G9MHwu*aqkF$;$SpT!1%Sn#BLP)$wkvn=w3l|<4NTuU zwRwGwF6qYbCH2+=7BTXn{o4{b?npH@3+fatO`*j7;r4qz#&e{DkX~1eS z)kCV*#ex3j*(}`aWoqSWuosPI#pmySln9P2F2RVab}C_;UuudFhwk zv+=F{zEDaxvttPf3c5h_cInozIU}ID;qd$!coX|ho_BUJpA1Q+Z=xhgstL4p z{zzGbH;9D0~hSK!9wR>>R8c_*WIc=g94+kxc{N4 zE~#u<>f=u0jea-%m$A2++;&VPY?XKSY-4GOmvAdsd28oqPKz$K8;kFgIpSj^}^rbj*e4g0K6LxYLkL#xrOmh-J_A3yHt7 z+1KqzlkD~d9NQHm-!~}#q&uc@Mm(YK?2J^Ij}?2Ob+Q-r*|(^Q4C+^r{FQ@urOnA^ z$>l7OxV63aBV+(xU#txX0EK15VCYip8y?g6(Y<8~qbfX}s9t2v-6sm%{?_}8@e{{{ zB>z83X*xIiN)qlG3V8MHOV*D!YRo)Lfo&cp zIVnQe<51~Y13|}JYkvkWVgC*{N(_y36=CL_rA*xCUE_)(HkCc`9jcmsXQTc$DjCMj zY?!<&6voBLB1=f~i^Hy&BJlX8ax>`~p(YN=#X;>|H?(vg<(is}V@Z5l; z6sa&&E%6Uh5#x3JeN>1=u!@@qrH4q_DoXpHKTQO>(I4O*hqW|&Kf7(MZ=i~hAfS@+ z{fh8MP~6ZOo-vS3@D!O6^|io@(-H3BCf$?M`{A}=)*uQ+uSbT|iG=reQ#Vh(WV+{L zYywojyb!*6yoZkiN2Z(2X`u2ez9WNBDz6t&M1!A&mFd*0NA*f=3uKUMqmp<0>;Z&t z@RslNTZdby@04T(FdFdeEDMcFDK6FZb_1`IHFrm62&;Ylbc`=iZm(XUa!n2QEc{`bH5FFd7St{N zFC#@TE`^pmczys+&cqFMIS!*2*ZyZ2f@MO43ll6Vc*xC}7H6)5P~P?FGmo5M;Q%&5 zGuaFEERx9vkYkJJP5#w)*;*v6*YMOn#!FHQdc0w}XGv!L_fqv&8054G#VBc%n%`4? zOVfw!zJDLqA|Hd|#HEWpHCLf}9CRkbqu?hD?2suT$UUVC<(Lps*M!2yZIO-|{V+2} z6Gv$+nQm1U$8H-@q6}5i~C?jPt%f*O@{Jwd^21zJ1VwS|^zh$WZV#xnY!N|`^={2kYu?cSw`y=-Qe9R=k^{NajtE9)zgsjjdu!w zt|gb$0UtWk{E4*f`M%K4+EyJZ%p)omO~M>(WA4M!&RqPPrT+X~Z}*Kp{Z(oZLmGcGtyG zV^286nezkaz!LjxtYX#En1Xhp)?8EMjLpWBUu{g|$HPe(0H8~A`T5SOYjEP7>UkwL z1fQOmNTA3+Cfg$s;EcE?BxrsrNL})PM(G_nP zC|Y;=S$n9x8U~+Na(h7PoO6L`1M#>MgiUH~CEvl2Zo!LFnl`r|f;hdbe75{S#42+g zPhY<`#QOj`FUF6nFYBzGx?$KWD%@dYO)rFScx4afx%*9XoZ4*0#ss$bLbJ+p(qcRH zw=xW$oHzMEWQM_Ng!^R;@Qk@!7JixSZ`{2ufgr%w?8sgS^@4oTC=bbdX00JJfs6WT zAE3%))4@IX;uy+N2lqE^oo57!-csB=lzocL z>H|ih03f*eCRu7dKI38eK7%m9`XU~|7b6q*GUG2MPdsHHzhwe6f|DeF za*vlFNVkoOXoY2ZE+g|JZRLD<|B4fM=fKef1*wu}SD;-W_w3fW5DgGOR2ZjOqf8A- zpM)k?wn*!;RKG|8`9))7yQP_dYpgobR}{Bg7pROe{bgBR*s_q%S}~uR<=4bA?x-w$ z$0miKNpK!rke=65b_jSLwfxFU9&4}~rn5){bCYVZv!ODY4uK^)N3`n^3squ7Dgz`O zNZR3~k!vO4A ztg2kVqltJ2tx#`z*PLML)42PQ4VmLHaZRggXuznk45izKf8>)xUIpH;NpbY!R(veQ z6_mmki%f8)``8b>K$A^oh#+ZA+_N}z1poz+ftct-%JtPD3Mts_yxr&6;+af1sdL)p z@M3bZpS%Qy4j7Gh8fYw9*}6crs>)=P_Y8;>0%OzCy6nXc80h-BKPc=tZhtDl zcgQOZzG)AvY4O0On@@_i{S{4q1F_7o?-g}3wue0# z5|Msp*wY^$a(xTk655c)5^_VK0vU8R2xoS6G-p?QKEWd4_H`JN>hrO>iu4INRty#Y z17ljbrDgL(1izuAVkhuJ_X%vpH6K4skl<9@67;OSp`WXG4;psxALUwLCk1=-9Hh;g zS&`O}{^J{!xOon$&Jv~!q^=FY9c=s!nb6-co!&@Q*ANyU!f7z%7jk|*{%sBC>p z7#&^ZE2F{Hs;*yOMJ#d^fqX1n&-qSFdU5R&ub0o zs$)ynBa|me#94lr0004nL7pIFlvGS1{{aCGnQ|h~>vKv{+r^Vt>Nk`KCW;%%i7Th)t0PjeQwwQ7a$p~9rx$#(fVY}N8i%&`fFNhuykzeZaVdg=8^u*jD$-B7`fCJG}{ww6mrlP2FMcn;}mH12@)XGdloPKX6_*Mq{H(OG2uk zvG=tsIB}TB|{Xe}81q33e5;!ys#c;O`fr7_YT8MJzXPJ1Lo{954eTg&$jkKbategJ_ zyXdv9V9cg=2!gH4uU2(!eX3$Zct%TDt?SI2)L{|9)7S?5@yLAo9vvQ_%=cVa7oUXN(gZP5k_i&}p z4`)=`eAK9Y!MCMnAOTO%4|71jk13}eGLX;$*!_<^Gjp-85<^R!cXU&IAfC9$)Ap`K zNk~2$a#(bP9x%Dt9O_8!7gqlh;17r2zlXNJbLHRr3%~#d^Z)<=XaSx+JLt`P zWgRB%9E#ElPB))A>ZE#VNE$YHS53*}X7E50xd7nyc-?OG-C_U(KL7v+n?agIH3%(a z%3vSM=|m6!12)zIrbwqvjLx5q!>&1)Q$ibflDL4fnV1RdT3pUrN8z3_kHsJ&5#zBwo8*V|kh8muO37NT@CLZO$o)6JFwH~wTV!^pboL&Nr`%9paGpGPw=}122-2EdZ(ldb zZg*Xk4+l0p6c`o-%h%-~=8w!n=*^kkUlYW>EayLi%2NkQCfEm*Hq~lAa$r)nX7Q&cwvLJ2@ z9tYv_!Qc zcTMxM$)xm&_%g~AVk!9bE3TCOE$w9z7ZI?sJ=|G|a&h&PZwIRhmTYT}9?*ex~0zg2|ZZGl7QBoQT{DO2m?C9IDmy 
zVdyDVnIR(PEizF#aTE0a#l^H-)N_Hs(5*V_C6SthT|*pus!YQC~&Fa*yHzH5CZGYfRYML!a!`QVu)qmT2`=Uia)xEMd| z9WOiiJWQ}zb`S7S+96|%NcR%yRyfWd2`LLOD;t$cRf3WA)c9^ggmwUcG0?Ipp zqxh&A=P3fG_vzEAhIan|`mLfe?-Ad6@GE3;Zk?0}RpH{$3m|aXVfQ|6nfZVNvB|>E zEf9a{No+z6;FA+nKJ~6Be4eTJE~F4MJUw!z#un(=%0zpaQHT8>J2B>8HKb3Rs)Atc zL#IAoL7|0hE(AgYfF7D>w$b@t$@39$z^pAz| zmRDvMBTZT<+|JVHVZcu(#*~>Cqs-j(5xE|dgGmqb96GZ^=1(;>0b;qL&-_qX1?my- zk0CCXGnl(hx6tnm-Jag(oCc_Lc6nlfG`8Fj++z0$U0h-GI}sR=(l9J#(Ryop zNVg)FHag0LEv`jRs9TbAxxKbaUAN#DMrPm8i6Vwy8wIp&T=Rs(@`UQj18NET)FOVa z^Yq@kPcixlCbfL}2n@w6JyocTtx@BUfN-l!NkeN=I4R-9f&_7z*T&tXxKbZkc-THL zH`G_%YaT7w@%WQu_og&S_e~t3INh13B&*xuz#na^dg!QUt|sE5hVpW)cTg`#(H|90m;s$@p|qe0g<3#*MEdx2m-K8JO|0$ifJ+f zV*T+7VxHFOQ$30vuWwCi&{f{z`w>;NN=)9yg8T?xR>P*qQc9ZQ1d801#5hW(q#n-_ z0k>7pzgGg1CGnr{QQ$onm*N%=MZ_s(INcU@hY{3{B)>=zuiXp@PGRG)dEI$-r$ zf3I?$$id%{Tmeb=f9jZjw`r#*jV8nglnMLu41fCX#XHZc8paP+G5Y#PxCd!#(-%s= z{;9h&5bhj>OD$^yRjvHrXNk3XW;MZgsk)0frDG$$EqZjk7opCn75xFav~)1&u*cSaZxPmGg9GURz-~hN^nsgAGMf%XEi+iZuuLb z0jp-gktbuVMZPFpIm2c7#^xSnLkG*f!k*fdb+ePObglnGTr4vanx<3#LU-fiYgDI|3r z=DrqxM}*`w_$Bd9XN{t+D2Vy&=0SK!r2|BFHfg6WXrb)$+5t~KovW<~mC{qFl>!-+ zgqFbm_7ZehTBoN?J>gPX!BFat;zf1`wC%D~!f)R|0001*L7rn|lvGS5{{xeJgg}t0GLTS?7b-gu51xFKbf-lB5R-9Kdmer7Xn+>4L-eD4BwB!<`%c$;GHMT2gA^odk7nsjA zg9MJdUA_c-6HU^N5RWmX4>+6nPg=dp)M$`qZ8u&xqfs*^kvQnnoX81O>?yUQi9{~cg$uPL5QEXpWXbZ1$HiHO*D zdExrLB3%8=4+S0@gB!lJ#ma-0YfhCR|1euuf>I-ezqYVyZ#5;rxN7d zqhOba!d3@<(bAt5G2R4I!7um~IIay}fHXxriqdAF;+m1EFpf8^>CG$PEcUqj7gH@^ zHH>IiBbZ(7w_H(}CEEc1SBU4JI-vjn4d_9dhcyT-sWO-ffBynir=EKz!7E|yc~7P8y6ErA36)e%_@HPmT`TqI z(!>CQ-MTPfq$5KCCki-kcm7F}1r*!ccoz4j-VO=7VI<$xq>Zf3mV&%OR+Uq$d`!E{ z@NA2*Y{-zxU;u+4`RhslPX&5n9Mm@Six_=>C7#Eo}=RQ9Z+PR_PZBDWZx9iVoycPauj+ zl-xB@O&lir1xUCuYsGon0n(DzAcLheRMU*G(5!;l^?9qSt`UqFey||hu7Mh@ee!pq zda=iCv2$KMY9`+m(1roDOA^X?9$w#8bImca=fN#+wWc*F_3<%FG7EXv+zqDAfGr)`Zcjy7>{2xC?vC?fp19rAs? 
zV#IM;Iryxh_nczRb0ACu`W^x5!yK}?>&!XhZ_o6vD~4W){uhx`#(E!9h~~-5*(~%y_U@O z^2zbY*iDqz7q0U7Y~LZ&W(K~xH;9ACD=k=w3Jfi5S4+?v_1X(8O=Pb=W)S&5M(}Iw zj(9Kp(dfLKw~=m^b}wRCnrec~&&;fD>^l!7ALZt5<|(pLpQi;=-U1a0SWsU0H+`P8kOn`QZ&w26YgteMiip^HNiXY z9gF`nM*69r$z#IprwmPtwD@Lk!gw2WqU=WJY1nquaUw?eS7->Z!p7)7;w6Y|H*g8s zo!oWju@RX$Mt+9Dv>V049`)8EomYdk6?TuB-I?d?k4z8M5Edt9bcX78)j#ziM&s&D zqRIwmXH_VgQEc^a#s{7>E75#{cucut)&igga0uU@#~GR%`ny z@Xwaa%%GOa5{bd=OLDr3slwr{DP>siNn~U%d`75R&+6$+Xv$p4Z;{lfzx|=;W6J_K zzG;BaZ2U`+Ln2#^&uUw4P#t~W8Ur{E&fPH4qNIfbcmiiv$3ipRHQE>0t0DE+dJPz+ z4_8c%%p8sAER$v@I1zP8ADztu{ddMNFH2_ZfK(N+vCE-cV*{T}e%U}$|KF77GI*(6LH7?ec+j)yDjF^S?9Y3)rW`S`!6s?paQk@YLIu^QUJ1d z*GM;Gv6v3^zyeYfX{L^!*{%-y%_=2Q09bmsJvFq6lD{`6N@T0FZu7&4=`MWcqDo#s z8zu{KR~4mMFiRRX+^z9xsKs5W=9QZPGiYs8H7Sji`4x2G$U)uz?zoy`chqOVjj8FT zlmJxK)RolE`&uUaStKT_4CnsdEqYJL8K`F3R2!yCb|RFce1~KoRh8 zIM16o3W&Q7& zsA6}RbkY}UT1qu;U^$r*;r!R{6o5w9S8PUy@i(v4-%}j|ZT0?lME*>B_<}N^MR4uJ z6-KpSUcEnO(_D4BiNcDvWuUCl^G%<5o;lcOZ}cI?T~=><a0WH2O;g_y-9rp zyO$K5=fQWHdgRjDjU!usE!)#8I{)wwi;!Jd| zw(qpnm;tIyeI-lb#?4$7lBq)jJKpJGlf-Z6O=0rC8TbkQwMDrLcU=BmadDb;B;F<% zV3ehV38zdt3C(_6W{b-J%4)`PnfFI@98rgCT?*=;fgc6Hz{xS!+P8iG>k$XFtWxB4n|Y{Zjn&FPp8V5t8^@i`;D!lq*N~!?`Yj70^YyDvGubbf zqB~DC^SHmF?%E#$9Z(*{I<0$i#%z^F=Vp*+Te;lh=NXz0c!VV>VOCE+NM;>6mo#q| z%?hoj-El^gTAkGz8I0ECDE@38?Ky^g&EWPUdna~r(LcNi%mF;5cB^L0TNwbHpZFp9 z4|PvG&l3|7I~_D)=Yf`AV>W@noHBW|UF#bX{i)I?fdJyVl!U=`s)`UyY+ zFdP_l$|M;)KL{x{j0Fsi1bvam-gxCEw*WJwJlfv-J!F5%e<1xs@P3_9U~IqYg+#2W z45`&!<%Y&ojpe%B2+6ht2>54`c(+aZ_hcqFwyR8PmHkB;{$Cdp&M!R0Wp7Z#TD>e4 zbX4A6#kD^TS8;gWm}`+lod97Uh-oGkUWzs<{rG-ykqBvo((Ot)%O+7Hh@&tj&iUt!gazS`S;>N4hs% z)rv1WyDW;*{JM*kR&*a0Mus%5?#`hPJHu;Hl4Zo(7J6)bNxFV1Q`FQbmcPBSbdIp@ zO`FnFL9(j*Lg+=%k<4|I6}MHsU%g)^xF(9JvKZX9G!T#(-GS^Jky8|>%}H(Y1zlZW zvwwTDM}RUq6bq;d;a|SU1S9WXxQW!a9bI;Iy^@egra|5LhGyEqKd>Y z+i3DeLniU(zf#O@|8h;hr*#6dBuIFl96PIX261X`WSK&iE((lmk!wO+Acjn=Lp_Nv zXm%W#U-SGR2g2bz{zlzbHExgECo6B)ynJCd;r}M4#_X@iGU|08A?d(@_Z_hnbU4# z8pTmAjgA%+JM+Y5!%XPk1m})oy_$ucB{HorBWKd;BM&JK3zIrda&l7ge%a?XY+{^d zeUnwE6bnZ2}!2AC_ir+E7ECX5sy-r3?&u|pS{j_<=$FjYCY)#l_ffK$7 zI=f0zWenB7sjxJvzUQKtDvwtYC(M9MdRv<=idaaSUDuz{l>2jIeU32DZt)q0Eubdo z2YqdYDAH^jyH(&MY8AVKA5GDuO;+52x= zUw%$Hx{Fv(Tig;I0!sgg2>leoIdsjjaYz-zG)tc{{I|GbOL8Hh^G>VQGi{71A?F5t zZdeBbj5ZK=wo*+Dw_b6zQqGzxFR(ca>+`d6V$n8MO3BDE&1mKkLL-@d@n4eLIn&=! zAZTDhRLoOBwyDY=+;HiggsNXm<6ArfvVsI;AOZgX01Bx=ny5S>OJvGmA1sxyY-c5X zIh_3GnhHMCk{q-&Om21NA!&Ge1Xh#Iq~9ZKd056)!Z&|BSVtG1+f*)(>y_7ltRp{vp+nM!AH~`rI)Q)SQW*m~ZbYC?m=ibl-su zNt`)lJAy=1CYS&KZ}mDKd4^K()P|v$6C&BZ{tO{~hdX$A2X~W63g|ao zM{hUrT~b!}0;_e}IqM)CIfI0!D7CONQSton|2AeA3NT&%nr4@Q$Y(0pRe<)oUbsVA zFn%0zr>OsK6-&0FOc_FA_{)je-A2pXvjI!t4%1@*(`t8Fa=Hj=r68_QhhAlVF#Fjg zpOU;lh{FaZ273J~EuJnLi;GMBhs8IXlvf+qg8b-CuPVjPBSS6wDZd85tZmV4M_x=t zN6`FHjHl)57m?S@2Q3=TAEIcs5i@Hv`YLvihZd zw>`a*IQ5aqizUnnEIKP3{=d0f07uRpW{hg^xL-l+ofg20))dsp@I(MVLBRV-2y$); zdD-UI`w0z>(q%by$<$!5^ha^C^1!D$SD}6|Pgl#$wY~?eDl~ZVx>b}vt4ghsU-DyU z1o4_qyT52IfeO(e2aUGw-B~~eP$wT-s&^qpSVSEpto(|-30QIY6lNa7&8%&sUg{Wd zvrSb&2`J7YrYd5V!i9LNKQ1>BZXcVadW)gJ2Lx!;JJsjfS5d{heEgcu%?>mQqO=wQ{2rH>c};+f0!!N{wR2=!!<0N3X464+xE_Bpl1gu0ckGqCH|G zy=M!_=1%Zv6cI{Yh1mT1&bwu=bYznG`QvjzN|Kx1j`H1!o@tpX@Co|{^nZVd3**{D zSTM2D9t7qZiHg9{eV|@rN@q8Z1-{jhPfBb$h5xy+GZ8%u%B5DY`Jv=UVCI8WheGgl^ zeIt_LIWSQrW;Nchq*gFHU2!k>I*bHTZKfCLL%2d)UqOXMqI0qUEnrafC)9VBn7y~? 
zJ~wbD5S3RN?~D%G<_f4XBjA`x_@*NBt6Fi&wp_M5GL-N(kgv=HZs*mIg^gjLkqY!< zn`6BOuO7R=sV*jt+0ByF?Lznr)Y$d1JkcU%pJgGBMNz0ay;w$x9pEob;nzz6U5&;N zMO$q0iChY$Wn3K^8aZ-JZcPL-5H?d|FO&+8(dSF(sUFUPqdpMpX#U|j`Eq3CM?K>R zu(f0Vu(eZmB42UmgmsCDRqy`5sqXz+0X@qBJh_Bn^<0+W2CFfm<9zu<3%nT3Vp)6F z!LXbcyEM2zHe3Ua&1kh}=!Q|d#Yq&xW2fb{y>C^yP>%;AR*7wrlxLFSjV2n*WaZ3b zxZCCY3{h!ZSs7QPSssqpDG2*em%LP`Y%0OuLaSpR3c(bQ&lo~KXjPgG>uot;`c zv`Y_E?mcPT%lq&pEQ%n7#OPi*fDc1En^eTCYuF;706M<+9;I2w&Ta9KwHHmkL%(kw zB=81}&^$?xkGGjbSn??SY@LFXyg$Eqhm{3UKp2FFFy=h$UFWlj9}G$`3b~xvunHfQ z_M;v&2aNl{x#aiMBdku@LQO(HaBnzjnZiC}0SyJ1oez@hsWC)Ts|8%Rin0~gSfQa` zQvb+ymA*D5pRi-Q;cvOpdB8acX#Elthx+gq#ijdjxrG@0a$7M{&^-oWJlR`0i&SQp zWOFWCc~o`w!oxA47~UNxnMZWccEFcm=OmrT04yK^E&RaUkv1+o?Z3Bu;Pvhi7Mzpa zK1Ww5*@?iSZV;WzjN?7bd55VV917`tgkCTGdkUq|hUbxC4c}&ZhjmGXIM?)-$Xd=X zmt8;XTX{)bbd#3sYALzOYwrDcZn~a5UL92o+usV6BZR_F;q z>pZcZV~u^cWF&{Cd^(1O>zJm;Y~CrpbeBNJHn}zGj+Y))G6AhHRuJoL7ID7C>mU&) zuH_I(RBgPso397BLsF+jK5jwXn>jHkWQmU!mRMa)UzZeiCw539JmntiHB=*41 z;Au~((mJHC6r8l?Ey{N0zn_fidWYjBz(D%Lj*=25rTV2p<2hzwT8yZ{g$^1z1o1A< zCLh`K_#{L5q|S5f^b&*4K^BT+P&_Q_*tzE*4Q97YK?lip?U-G#|62BX<3YLcQcHsc zM@eG}b(tzkUO+W-LMsLyFzgvfEVq32X@;nsjZw zNXAA>1HpaFO+*Wh!@A>+Y(Rw|5v2`kz|(D^&5fi}tvM|WOk(A>)!}Kslr#*L`A6YP z-y>$yIp8^8I3)B=pmU%$Q4@$Bqr?{Ey8J0f06?^FuxtG8e=V|m=PP~X<@fQA+#m;c zp6e-J`;%V*<%2@WmocQ8*R+G>rV{|KK(yMa{v@;CYph$(@!AfGu~OO4=~Psn9p;f| zemURaetg_YF`tek#!2ON+T_wdYPReEwnO`daGF%I0qA>(&P}BD&v><+2cQGhI4){a zAxY1Th2{2=-GeoaKN(?F#YP}@qO(Y)ao)jO!~u5`3ZT*QOvZJ(kPswFKZ)CL*tHLo zE5cC<=8eYeiUq8uFRShV+}XaVkQvDtW^P|Bd^gnMivpB&fY<`#41DmTkZpWegr|W6!XefE1?2GM^DZg=^c)qq6HFWt%=jQ1Lr&8njWheO;DS3w!&Pz^->1nfaOu_S_SbLIkMk4LLJ;vN=2;+{t>_%bLSYcg zET5TM$O3)sjm?)s;DaP63&?iNV_iZPFC{^_$1BVLl5SxMvHowD@rLZ%q4R@|8cXj1 zlOOW(p#Kcz=A*X2HF9M&2p##eb8Iz;4EKn;Je#M4;is6P+$xiW6r6saiRk37SUFNv zO}O)82P4&EKz9j4)NN6AIT{yRI=sFDGWJC}xi%N&tsM^`9nELu5nBh~&+aXWRwS9H z(!|VNFoQnl>(*$T=gYS_jTnC;K=D#hpGfhn#hTpXHwo8R57%ZXR@>&dI-4ic&J?; z;U}V=BR3%w%@9O9)y!M9g+L@a^el=I9bXZZ6tVjax3E-wgMuJ{lHAy~ZQHhO+dH;r z$F^;;ybWlVmhj({1~@Jo z81`?P_|`ZhY-s=Z^!}td*=v(8XNcdGP{K#wXjID)*|t-Dv+pd*1bargkZvm74_FI~ zda6T~M_z=R9I&!eOVG)bhV*SsgF=dJb%jCXy|!YG%GkQRIkREW)V$!G@3rY;|OzxSS&;kHc z7q$7E5wd3j*gI4@6_nU1ItyHY&$rB;%&~!u)+>R+C8*fdyMi`)`cmwRBBxpM?1qcQFe9!)c@8;4?BVafsW_M$ejKn(%=2C5-YKU;% zrq)3$5Dh~mqLQf?6A3LmAUHY3pynXJL=@X~_?QXJDuSP@-QR)X1YumBp0>WQHl5Xa z9r=PWbd6+~`rmLcB~EvCZrpb)w%Vt4@@>2wFU_lQ(8(A2N^XTOc6h=pGFjNwk9lS< z=`7VJYu~>y1xvc7*TgjZAQrRLT`Hdl;M(*MSCwv{yQ(7HA2|{h;sz#4t|CX5Zd5K; znL2GlB2geM!aDOOtYCEw1w)0sx?kaml}?e#TywSXTmmwlN7N8b*L`i>q9sz5%Ww!) z%vZNUg?Olgp9FD3S)UZqQhF5Z!L^%f!}~7tzvvnTeydN7?M#O}7S!0SsA4>;h?)Dh??s(3BD?(St)g$1f53n| z7+_g(Hg~75zXYS0oB=i&G1Lq*B@y1;PBzv(M)as|C<{pt(^tgsC4l!&;)1F$}#q+ah{yl!B=qAwA^83h>!Pm4I`s&w+mBKKqg`M)kT5XcR^h-~nQ(T~ z$tKwcDMMXYnWf^|A!(d^6Nd zkqnj`5+c};+_6`q`z0Mw-kHVLt(d!XBSY7d<-3i#ee6tC}pFqwR~fpSOQ`i~~u-&9mPmXEBQhOl!-N zHSV=fr#5mz;BI$=VEiVpj(;{LJKlf7fhGL zk#xY>pl&2#p|YFG*_#0%i=q5kZc(R0ONK(6jC0#JL{d|ka?~t~?cgoM1SUy@1hK(} zl^v9i+EC)bDb#^H^DO^oc7jHIAJSuC>w0-~Wc&sS^SP(?3 zso8wn3L74yVEY$G3n$OOOqe{X4iG0GU<~G_*l1`WC7|MYsNu;d;;;ln-Q>UkAOGtU zRI`U4G5B%tH|46vCH9|Y0UJe~kW5K5kYSnP1J>)FiBIWYCmbV~|Eo^7!Pg2Rvw z8FbEPA2#ux99ckD`kCM~u+{EUFB;l2!O8wl1=s*vayF$kxx8OVOD+3xHW|-J;yrW! 
z81P470}-!J{;)J%b7@87){6nDh&0eG(n%q)b6J_r>3JqZQXwKcITyG%53hJ2POT@=s;&syLuz%;smLh$@IxLrpxC&JB_lfe?FHz!c0%RazDgE zZ|Cz!LmxLXGUUO*+Y+nSRqPPNBa>O#TYe-jBh-Lr2Ua@{%E>mP-1HlWsbtJjZnko@ zgNz}(76!30pIP(WZY$%jdj*i!qzB-B=1VKY&|Ah8`^u|>sDpI*28I*GP87m?9{yyz{il*>_U|cr6|GStjiZX5C1H-7l-mVQ?Qy6waLSuVu!-Gm9u9C zT6oEXDlRE3EaF*SIFpf7mQKHmdW~d2mn6VuC>}N5yRdf@yB9`->CFU;TI5J0*eaE$ zDdGAkt@H?2r3q`3y785GzNL7DP%{DC*zMEorx1BBQri(mQlif+kthPt_2Sq?c?;yB z8O+Q<1q=Gx`lD2W^-IL>jQ#6yqk@^V|F*^lQwHE)mF5~rS4{|Zr!%b*gB2KV8Gv40rqBHOP>$mJo#qbMjM{r{HU|mCdH~Ic>B8 zyQ(ahNLIQN@J31GRs*#anL#RCL~OD9-=pM6;IBQ`6>`qr0su##P@(HCL-N55cHPV&>AS?Ls8mVC)>NjJ1Q4mUxe8C;@hnQEcGG;^o^P}{| zhG(1)`~^jR3114mNAw!Wsix|VJQB6!&w+buF-Kge*NS@ZnYX@?+loGD`nN*FrTvA2 z_XOa8V;Tz3N?7r=qI_+rySsVFhRKs-w-Q;BNH8T41t9>~Hja@-R^bWH#Zf0Eezm0T z!2sUaD45_D)Db!1fL$2jLrZJV%N;yHyB;^4-co9ZZgJ0`C^#)E--c`1 zEzCNDa4%c~ehpJ$cP=`1>Q#mja)EPxrM>tnTdqtWqZ0^5{w~ry+6%wzly}dy4LVKE z8}e*>3Gw#z4{TT}TN%_T4wl=D^At!gPRVf|2@Jz+b&8;M2oFL%cAarboKY8m$T4bV zR|h#ab`441dz_sM&jI;D0uTBqQI5lOM8UKnj85XcUo0y<7??^RevxGvi2@!mk>cNT ztQg4cjXx^uLgExP8nN9S(Vwo^SZjj_7n4`&qTpVgpSP+n~`+{LF~I#m9!Ite0tbuA!PWHx5pSkH02HxMNXS zaBWK!U4^X(fnsF|k`S3$`k=rx*d?PMrR0VX02BvLa-`jC{k&{fv;7$G2bjN(4LhUS z{RaS*1T*FSQ_Acc0dpXM3HU#!wtA4&Lh~>u;TIR|w9*d9X|_R%5q2P5WrFaxl8XJm zz=%JieIj^Kw3Pnm0ub=gQyhYAl$>yz#u2%e`Tk4%D}v`*d<6D72a+&SC*TYX!~K?x zI$BkculFfXq4PkIqbd1nw1Q=U*kKmluRPY16lg^L19ZB_fU8`JoA5B#N95N98#Hbf zU#i}evH?B#u?Uj1)7BAY5{#KS3aVz1f|HS_Is21aV&kh%7QFzyOH)Fl z(55TSy4W#x!O&6&w8h(g9^E6=KOfwT+!YONY9U$f8^)lK$nwu4K!BS8KsFJYxYK>W zBR&`#$TyN^jT~j1^!<~upx6B|G*NR_7l`G_g=fWhv)zcgW~5%LOdsLFEOP_vCAM$w zXG*tXC=^s2seZn_K(Blkj008AmNlTEK9|za<=?B(4{|pwFP)gfuY_}h%b}|*#0&l6 zmJKS|8o*J8B$-$U-Bl-^6^K#_sBL7Md&C|Diq6gr2X$o%_z~jRo6>I?opHo!uw~P0(i_<;j^nALQkM{At98g0pF`XFG>=^K5J=%aqm=S z@=WdvZ0wWyCP_)>&430^`R$LjlM%XJOe;o+=ad_(FYX17Z(1U9 zSKq|HVeNtI0td}wU7_IMN;*==H0e1HNWOhBXhE5SFJIoL$~W^&-YzLj9A(y$_}dl% z3%`>jM!$&F_Q($8dd$f$0cR3OK!2ECVl3t%HNEYc<|wdBOjoJ{mq+l*@I4SKdR-`J}Z_M$_x;zx4Jeny6hg(0d zZpS$(yAv9op#QS&WA@rR(ytR?w?;2ZzdB1rfnFl3HJHItL!NsVcWJi;ble??)!T)1kh}!# z6k=s1SrV`ayTCwCGCy>KtJJPyp=CBmAlJLdH^a1e4FKl|RPG-(dxr=qf|4-`XOeZe zatB$3WD{yTayq-^ZyrGe6^d!-#c_Ce+d9M`Mt? z5g!k4rf~1evwN+o9-M-Pg@eg0TIg@k#~9b3wjkaqPCkwth|R%Nd3823>j+NhE}|mO z)l1#pHnkEvz+SQwoz(B)^A?nITvxi8Aw^Azg3NL3_EdWGXzB&-kqbvluht`Qw`x~1 z=#$p~{kaU4VOXgMsT3aH*&KsSCMJSAuJcJQkB{yf9Iva3KFK|%-$G)ah(W#m!I*QA zg#LJyCOiV2v`x{goON~QMEuGe+dW?&4k9& zg+Lra61%k=^$!dJZwhAG{imx|?JA)(JeW3jwc-CukOt$uPAB@aou8#?(A3}%z@o53 zS(Qi)MvtnbbD(<=Y}Y?=iSF8Qna=Yv!=^dnBx&OIFRoCZqo$kYZ8iWmMAM;06gz0! zK1fDJ<5;-=uy}wW8M;sE7V4%EUlf1Doa26j_U2;E@Nj}?Aqm& zwtQY(Ia4Hc1^%K=Vv`ekpW;mW#EN2%A&f{;l>LBbm=wj6L}M8v>|0p>JFyd8RWVu} z@N`y%&l!fU#iA9lI*UlVxsW(!^bw5m@-ePKqdPcad1p5hEMZsvU;Ot=JTUgVAVMeC zsWIE)QPzh9E&)h4XdE$R*p#iy!CmV#kU8d?2`fTcnspzxWkdKP-lg@dg3|o9p5$9g zxZki$>n#A$j5E0<>Xg@67=c=0nS*%c$Ctm{J;V~`?np1}9cEy+rZZ0aw}aXS!Io2Y z#Z^hgx8OQFZEWv+9#sU8PF5PnxH$K^!`ce~ehRI+Jwk+X`gtV*RE^K|5ooVHX!0&J z%5uIZt2p2)SvhmmVg$TAKV4&8lRXwpMB!#GG=)qhc6S525&XfT#d|a$9Co zMoYh~cZZII`daXBx%KaLje@?ppft=0R4aPg9yO%f6xkn-I3?Y^8IpcwF^A%>sZB)Y zdf!R-5;%Ze*?Q#!1|GBfyZpN@{JwYQfufZ}kQJT%$cogah0JBVe&%bxpnPagZW|IR zj%!L=TV5A^bKIK+38%@1SqrSASfV|TvM~XP#|(-Nz|NhUSw&1)A22tbw8qrd?idJ1 zFFgAx{gZtg4}>-wb6D8G>a3`DtOg1!hgf`#pLkfHCcL$EFHCZuZwdRMN>a_TXl%&W zb!3%TM5tEbkn|}h&vojrv-8rOTcBz(zdSY*HVqhRs;iOxdj5%EZlJkxOaY?>c`Gz!C$$iM07M zz5BgqG)0P2_~(4r)Tvc_mh`+>T}wu!)}o-UD(lPSH~Wb2!z%m*dv(D5=Qw}4y7eBM z=&BM)x(8?f{l$S%31-IsS9yb?BL9v>k>C2Ca@E8OD@U={@O4c@j>xG{#%xXStEG6? zU4bF!8_Vd};l%}VwH=T<7rj6X*q6`FTY#<^Nk(Hv#Q$OramhnNw{@(y-GzWO_S-A? 
zg)R%#d^&ax#1rFDIHAknYIZN>99r35?2sJ~)Hgl9t)D+d9!yBjVgdFFtmV~Z+pyX* z*XeM#hN2xc<-B}2ZUARki~>aBXUpkv;LM8>FbLOAO;h!zjh5Tn-RRe4Npcyx6Vq@r zlm)X5WxIi7<-qGv2hjGQ)BF|IWiwnWa6DYrqAv3AXO|4jb00Om*rZgD%7ND@SeM<% zq1=`IK=>YLssc+)Z)=`-{+kbca>YjVH$PU8U4Bc3FiHBehm%#m%0)$Zfq8NZfnd*a z0vsCbxSnCxU$1Vm!Mj{tC8C(~Ro(R9Wx*_W1D%*)kSxREv^JXiU_=SP7DFVzz7xSY z2X2tEh^gr0T837*=EE#rL=bn&T4g=1KG1@YMt@WHw;i6o4CaorwdS8CWi5^-V1)7G zx@yXGo$OB8`29$)1?X1V<-yq0aPswYr%+PuXugv9+p6=z(-$E;*4x*Nx`%7ppq?$R zDr!*}>!@E9kghY*?I)4{(sGA~BYX7%XO-tQ$^HCy@-oKcJ{( zH~-Pfi{e0;G+}%PLz!nbgm!WrJS5Epnj0$JhY+gMfRLNt{>1H%&OwOrlETr`Hc1UZ zzbmbfL}SVPm;t^ek~Xi^0?It1J?wh$CrGj2V}IoLD`-FdBMm@m&M;BxZ@NGfYV}ps z%BU>lO1Wb~jnuLcq-EcuJBD+W6`!U`9nH^Sw@PIVyK3W^v8n4UYUw*wi@YK)paC%X zUO36rI9wupGY+s5VKmSUqqxXj&~D#Efp*~nD>YdW9K-91o^6?HI?ks&5iAADo%of? zkF$U>#4onRg0gZtDiux_3nban^jRKq!mdl0`P_><53NMp(8rLS`#-;~lz&2&7%SHj z>yL_)*r2HPiYF5ZggK|cwL8$KgI9&VvVlu6)|M`yG}@v z%i;pS>{ncL?uGaS69SzaVMzWkdvqO$F7%hdF0MET_Vb9;Wn+(p%`vX}Mj?+0-8Z@0 zPdPVUMOf{Yv0a%iwY&OC104Iuuu#FOGk2R9{y5qNT;-k{1uv8ZaDk^~!V`ZdC(>r& zWVF!=Ctz-yS;hFaNu3Y$$;`+WDf`I!>G{FvmmZ>3eE-Da+Y{7$GGlmY8U<-9uw%SF zfS|{#OG_)K*h7=FfakhoB;qS|pz?eK-*A{)))>+Nk^L=MU>%9hW&I@FbDDJmj*q&U zAIH|&{j4^Xsx3CjGxxm^*O0(gqp;T`XKmC$m^nKVh5$#0eB)6^Z(&zE+Bng{t&=Y4 zDqQ}+mNRrQ(JOeRHJkX;pdemCrdpXC<{H_1G8TcyKQ-A7E(NADD^6!~|3biwI4@P5 z{DP#hus0lqfNUd+6z9D$L7(@cC$>u{G~YRByG}~S+pUC(vb#)V2J4oMoI1~F8o)4v zz#W=hl(J#SLdQbD0ZtS?#9Arfi?ajx4sUjBCZ~-Unmj?YR2BqhchwUVE;Z_?dGzE2 z-d&Z^DdcELL{0#=`{%q8%Oq{&|RpS*shr(V=szR<7DCxg!@e+4D$Hhy1dI{|D7VR{P`c^c^P`AZ1Yfsp$kPC)o~D_W7hWqQCp(XS}c7fbQh4n z#L>-~&BX$fQbeR&HA^K2i#|6?ZK%{pn&Hss8kp1{9%w`-+@nNqsa^iwQh{=3Fd~G( zNEP6uZ$Paf2bjGCa)S3xu*B|tu;`A(BHfaw|Ac`5tqj!tf6BnIU}nSrRR;V&4Yb9k z4};Anv+D%i&S-7ujTYDYzjkMpfI2> zK<`ME^Jb@6aw%L1&{s${6pCCNv)YvgOF-jEuEZ!`*Dm zOLHL^i;6>gl=$+)4CMCKU-OH;EYW!7Sb7)Huw;oLaHfb#I}vN2qrL|oYZNhj?lcSU+*IE z(i6PyIUABHK^XL&bQqyfHxBC0%}eAkj z4AETlUn%5k6)L@m)Vl7Y?FoUn)(<&!zh%0iEdfX9r_^7Ro%+;u0ZuRyglsCqZ2#9; z24GNK=4=2}l^~eRTe35uZL1q)9~C)W6YgP2Vb_ZR+IPe~yvOxJ$s;hxO&zmoe@*o` zhwO#tRYKjqDw)+7$rK?)!U`h2l=-i@NMev!@ z*?d)Ze+W)0!7yxuvd8b}Kc>11t_?&vTuC4_1lr&|Mg}u1TX`62XF9QM3~5ATI9KrM zm@!*S<)O0&(UNc@g~?zpe#ESulS3_pg!|T4Vr4#j+(^5X8C^Tgni2oOU;@F+#s9!y zFw$hGGEgM^j9*TG6M?s}*thyeyYo5Q9FU#JK=SYQdR3zzz*Mq&OtR_{M|akXifg0A z#_+XT7vPg$t>coguam-UR zm&#vY`e2X{uezZOvEp&cjBk*i88x8j6pjT08D|S4u*7m|bK81u)Sam`rq=aAsZe-K z%}4Nb>*1C5aJ*b91>G%+&wOm#mz!c>Db6l`tuvMUc&;TmZZ>r4E)(@q+0D|0c~U$9 zyTF2{1p4O8(pE>3`KUOmZuOm?_k_@{pg_N#Mf>ULu1&#Z(>2RooM3+2G08rlJ{dnt z4bomFfAEFVZ$xtL!792ZHkvA#gCR!!$R^d(8Xd+NmB+ZM0I6H zUSHhQvrm-ejZ6&I)#nx`Lv}p|9qVKV2MLU~#W4yD&pb%=zK7NTS%&MbD(-&DBiwH6 zeV^!Xj?L!`q?uiIC9I?o{P5=?udjZoRz59fQry?Cra8YVIs(au%oTn3~Pwb8}hZ8gp}(U+92Y|A!@^4M2LvahM==uJR zf1rF?xdz8NG3f#FyCLydfxq1?wx|dIu0z&*`9%Lw6e37_srDktr=v8}4O8#MyMKB3 zD6~!|Zft6IM3+!8FScQnuWL(wWw8r280l#sLY0jM6N%OkX^a}?2Y-a6({rPZvVNfm+(9DIpD%rz!o>W6?@}%$G3h|ML&TSGlhqj9L&`$_<-#&JoPzgfXQI;g> z)`C~8#3^#*wxE`B#DI^KyY6zn1hDWyPpKevHUoTQ1>gK?r=5;YJVCeF{=lLJcTfl9 zt?ULnW$k3<;kTqYyipQ!mFWB7m$93VgbBO=*{!;J-_Mgj;>~?%^SM*;pOfS0fUBLrtlfSIG zvVdM@BWAvs8ZmS!R>iu&`-4hN{(XDnF20(@4dIegWjoC8wWS8s!rtVH0FGI6QYFu1 z>Zyl)5z6mtcW^;ydsHruREmzd``!dLuff=PPnCQN;?;xCY7;@*Jp~#3^_)t5GMh9H zJz*cL&iSIXxvswQa}Uwm_5R$+694yNykZyD`KgZs^`cHC68bPFfO18&yO* z`i2{Hu*_FDd11#uwDlwhTe5`F!MQ0wJ$R)*?77+*rUNQ#CiQ}P_rs)`2wwYR(XvP_ zr%cfWY7zzGQFxsC@}*Z}C&AIPD&w&y_)-(}oYJ-xzF?|l9HkH2M&Q;h#!X>h0-vxS zR*5#wJYn71dR;heD5J=bIZ?05szlsOcP!<*nI^`f0Ey6wf;EYTMrf7MNUXEKXZ{-9o6+f^rcStncaYa*bd~c z;8?h0TTy@WlC2?Pdj}<(hEB=@9i?8s`pwyqvlv6y%K211aPuEuk)y)}8VYGCV@^?L z7YDe@Q~vy1hIT|Lvkv_kl#9x6%Q5uHAcDgs#DL8cS!l;&BmG(@gZ~4oa!~#oRuTSV 
z@h+OWAYMjBzOv6T3^&S{Nq6pK&Ph`|4O6KukT&nDTK7I68ndG z3o3kA?P6q8P-r5~cq!`7`<^G`kv4l+_CvLa(;wRgo>$pfkeXfj;Pph_oVb+zn<>|z zIPVZrwNpN{=}vq(c{8ZX6S972?_S+|2)VXemZgx=awe1CbYMStg+rpZa9<90FEV3b z;_4Ll@iUZE_+KFG7kZo}%Z~``W{{^f8vuvDkNihqFOzgW#55~vYrem)k~piB1am2q z=Fj@osKgdRMbSqFf}^wnz;ozU-x{iCu4v)$*a$sKU1l`{9m30ukk2D(hi!0M<7-Gr zzoUp|iY&lDt8<9=r0f;Tn1qKWtjs$%=(+yap(9%5_w!pRV0B*~*Q{jjp64@tg-tVW z%AkIj9PgKDShp^h;o^!%)J`2Tc>Is`yr)IOicEnzgovH4O@Qc%TCBwYkhXv}DFyHw z(TJV1_7SSgLWk)Mo|T}IheY>R->^9ZTN>Yz8Uq^mbWBn@&_mEUVl zaqP7%Y6XpYfLzKZA^A;t;3$_6s!Z{>yT^TJaJ`JO7x*A-g+!==*H-ad*TWre7banx zUF+VZ+pk{V8g?_m!dpWv(b%Az%_RB_bEG=8u=F7iIj|9egd&|0m8fQR3hSib!DkTx zX52P}=`w=4w*x4Yb-HvA1x0x_!sl10cwj;6ZYa7k0~?`3sjKST@S5K)L?$0=z|8VZ zSj6yfXWsAd2c1`oArApBGS?yoIc}G-Xv!XNkVRz@;ULTsRP_9WT_`2c^8O&$TGZ(3 zQ)FuOp&Q1bxwdhoURo_E!uJ%Od)}iX0w&D5*X;X z;Cksv^@2I}D(({d#qIOg#vVI=o+R_W58$b<2d|h<`o$MIO7VxBOZYkwtLqJp#qJrG!$1E?)vb=M_ zK}VIUiC9H3U0Dt!T~u~S-wotf&x6@OLbYBRE^5_WFR9lM{n3GfYgqUAUX8s>bd$(# z#l3ioh)IGdGM^NeSJQ_qQJwxMVhV;=1nS5=;mfzXxP6OBRdu(ui;PqYy-z)hV%tzo zN9QRspDdP*{i%VLS=E>=7ubJ&|HI+7eq4b8%3TgS0wbI~) zQQ$}VdT?JJAa$gPTc+$dv!|D(fkz4;3VXsB;taqE6x!SDp2r41Z#yc6Kw(U{P*1`Qt40cn+ zj-^XQ_+A~vh-qZ3fQOHk0QHD~X)N2ksP`4acrO@R7zr1-gq?F3q||W$dXkEulPjGr zF=NiHfYnR8@&?g;-^7ObCV?*X^u@?oH*w$k$Jps4!rl=Zz4r)8HPCC_h7YIbNQNmB za-1T6dv%yDRYbaMBKR2irTGX;hxo)OOwM%J=5ZB`Pw2;;TuG4Mvubg7?QlTR$>y}n zI;>1w7C3`Au7~X9gx*imu+CF8pFe97)93E*WaGquGNMt5ydK!^8eJZfx{+RlX8{*@ z@WrL7qE(Nb@Vah&^#&rfx`EKURLF{5nlZy<@6>1mV%o~vTy9|VC2A)NVrYtm#sj4_ zwB2{p6yekK_a9Od@vDR#dsNaZI+tuGOUh+_@XO@*#HTVW0N*JBFyU*tx4^28h zMTMX=F)-=OnIqf7D?;wlJZP|fRArpYXTx<~bs;s&9NTV?uW?_Q5xv4as%ks-;5Nba z;Mt(VUO=loL7`LF?P9F79(Q~)*MmD{j6`oulqv%D-@VL$6Ni3#ivJoAZ2xmWfJg~u zaQIL7MT9U~3Tt(Q6nTY{oW@s5 z31yct+jIv+mH0*Z_vf0q3@lxQa};JB&oH(VG(a4LsH56$3>u*}q^RS`@x0uWe4NE7 zWg$&Y_oHQa4Y0&|==#WaTp@j9=+>5<7T)h__(1@I*!N}rHmKb9u5Jekyteug^^yiv z0;&whDxwO1fdqqsMVlNVW>6XDLxSvD0N`CE?~7gf4c}JCQ?6s_{>(=mh7HWXwAlr8 zFs#da*{$8Qd!Ii!UHltx@Nlk`o&S#joPQt$fd-k)W(JyproBp`2(dlBN4k0B6Gp}) z)K+*vnmj)J^e^-V9+O}hkZivOs_ztS&bjL1m^!%{{P=94=}k5bZdmk28ElM(6Eb`j zoL$DT4GZLQ$mrG_)hcw=cNsA^pjJ)j+xr*Ol)0#tv2>>jxb!|=9dk!X!Z{A9IsVR8 zm}!1wmd4Xp*t-Nu&GJabiE zexZF%euJiqgI1yNY}D9^^8IKF2=x5Vrr8&Yb_K zcNo6$DpKWeIlNVZ#pGk!{s{}luaYFIc>Wl2L|7rlQjYQ8jd4M?9@rOWCIpfJ_VKLN z$JPo-oBt}ab`|g*?L?TJc=x4C+~zJzbx=HuPmQo*1lXRg+gX=*9x9Vhonm2oA+YQP zF|kH%rleA_o{TiKXG&dU%N$n`4fxy4HHR8CYN-vedy_B-*Ebr-H=nLlnddBtDy~vc zMKXc*X80-3-?Pfguy953#n$^34PVS)r{ybK<_KV)-MT)(hu|81{sR?hLeW72vS&Mew3h&?&?VfheRi{Cx?R&hnv*% zFC+CIt)HXu-|dCqAKR5Bn4$fj_5w%>WJ&}}9ji(5{O6r`x#>1|KnG?SQ&HoZ&(YSz z7i^r@bXu4^BqiEXEI_mjpwKQSgjj=?*^)r*Y4+%^BvZzqL$)D!PmW+zP*Oms7R0ty z`+lf;L`mTG=Gqu^xe41#eNKCE;B(VX>IBQjdRS!;H-BZoEp^YS&msQB&+C_?k{`HP z#V9i%cB%A`(Wds@v;x8+pdil;+iXnTLJe!;?v-EM4bw^#>_bGzP~egEo2aYBR_Wpd zIz&gqz?8wcCg8V+%+d*Y>psdhY5sS8OIbwvglO zNKrEIAJ8V*n$6J#QS@#h=mf0AmC&vOf8qu)Wo7(&T@H>OIe=1G5GGZ|(2*&pGBySIow{e0 zW#_rhcLbAB-@kHfyb6nx1Hut#6EVi^rd}0N4m5GTnj~IId^)-#@=1)7oC9hi>!88N zRR@9|bzl``X8rk$w0EP^cYF9ejNh>hp(F?h)8{kc+_w;A(DTmz0>s;Ri)j0Qvl4u3+N34+0`HYJXFo0zAG_ce;X;p zreA>#BpGcy1G(B)0?9%2&njq~aEi(T&D$(&P!E5l?yZHtyj^!xEB0Qnkejt+;7(`F zjt1+r6i_hc%!F9@?rWi8lkk24jr|vky11tI+#;l^WKSb(tg}Og&#!{OIXATp4!_zmgc2YJ2)PIivsvXI6Ix;fVC!SRc1K zQbo_>53wQTKmqiHV-LVbKG2k(JWJZXKT&;}g_(RF8!kEa$-R8BlLnK$O1yJkwrtQI zaUAC4J7Yn!s&~!D-43~u$=ur-1bk`7&K~A(6?zx^X1h@nIFx5-mZ9dey1m&^t2!@qU+18IS7@~j(TsSi)Ak#~Zy;1^gr4pDKQ5&Jt-eb})qI{q#FYZxI zYjIC^$K-?t?y2M#u5ejJ2#MiE{0dE3WZ43C_JJ)g6|1P+M5elbH&rWKf@%PVD>M2R zF6o76_S-e4U%bNNpEm@W(^&ST=87We-FvBr*&V(K@JHS_%1M zQmuihaT>0LOVYk;XZ1b&x!FV^N%{v~t1sxuFzpl^4LeoOA2nyEB48KWvTD*NTA8*y 
za(N43=kNGYJx}q1W9dJL+{XWTy>_lTkGxFdHC$3YpNs=Umyk2GnGm6!X8{kQ z{&N;JR?zt;qcSvn*!(Z5-YHC!sK)|r)brZQHhO+qP}nwr$(EJ@en0JNLEf zrM@I9*}Iagg@K1NUtmiaC=KsFZ(apmXea zPn6}8PI20?fKbD(%pS_opD}}3Sr-_BP#Oa}PqABksrWwECn_1zNDgqixs`PMdXQCq zG19MASY8cFC0`~E&kDr93wun<21jLw=gO1=i~9ayb-cRLw_-5&eFlk;&IX`1u>^>$ ziVQ#?vFJL0U4vUn&#u+?k{yAMlib=L@bk%{6sy>+$?qo^)Gwid0p1t<)GExfeDryU zY4qz76cNk+Dt2Lr0(WsBgYX#S6L~d@ij{|?r&<$Gy7QKwYRp!5>sJZ1|9af18AFyl zC}p@m>0d2D$5?TbQ2uV~Aiw(j;c?!w-QGW5PAD-a$uKdOr;CQ;n&Bmh+cz-LSfDle zmIXj6u|J14DYq+m>&UXUxYqzY+^74?p2XF)s0T|~LqkEoNa{AN(g&hS$Q)USW z`|_Enr0_adi!*OAh!PwGVLd=?4}bO5rD0z1Jh1>ZLhs#H5G*%9-+80zbLniznOy0*;Y=)TjTLKu&prCmTdAM|slXWWM~{|s{v=>x zQ+1(VdtN2p(%RhwcR0(`guCKkvCV@sxfR-bXht2l~Z1nM6KGY@b{td0vOAq0|>RXDK}e;29|N5cm|mo=~>)Mi8iw*U-Jw}p+B&Y)&I z3CEH@VWawAh-->5N``MJ4=@I2gtDsMb~g7OJt%2;%$)?&mL{OG`L<@J+c`3uXi2M) zCqskpi%Ji0-6`Ijku#{GTwln=0NXZ_E=d{UZYRk}+r&UBw?v36OV%ldNGjfLdjy-) zIUMbOeE;RzFt|(!c9ulGfl^sAq-nwFDDTCUik#|Dc?%~(;_7v5K%D22O-*p+>;$5A zPH53Ul`s*5P7$OO2BWh_zzG6CN=mv zhGWyGra7@z={1D~re*zYm|U$~=5RQ^ggJFEVy|2xiFvTwgIGY5#c{oD7oO2?=TrB6 z^Vi^74-5(p1!VxU*WhH69Bmd(iOg+6#5OQkWDNmYN3#@O>xtTT{3W2n37nEnMm#Vn z!3~x4vZ9i`Xb?a+42PdFNBbr_#dt`fsRXXqBaJFO$UyB8UQ1GN$~p&g`av6EQ>VPAQ;Is zI1pgQ&Yt@ktQzOYQzojj$Ngd30!s)t?&4FY6ivlp5gIsxUXY3!)y9}bO`t`&@gbw( zF<*0^SlKI2vYLj^7GLvT4_P84uP%rl;%8hbzJNmL|mjM5O<@cXzw@ zT32?Tfl1_TbT2cy(jpXoW1FJ_8q+5Z+DTebLAj@osXRrh104Y#?{lw-B{|ni0gY~K z&EVrAwv2M^22^p=0US+mC4pkt(^VAAW9x8&FJb)7K4~8zg;bRGE*m$ArGP6HMo(;+ zeC41Ck;YPULRFa;1MVT4L!RU87uk3=faijqLsy<4Vs#W6?AWHaTqr5oexU6WEGZ2&M&!uX z&M(Qe-qFqN^N68HO4;#I%*e6(Wx5ohxIX`!Q3Oq?tXp=SFo)(Cdn734o=KCKR=_Nl zgk{R;k0n%n$~z1+2YakALbSz(t!l)i0gIOO<&t5BwUF-QWhm?%y1R45Yb{Db9E^GI z-xmrd(n_LCu?Tsu%3L^4!^v`jt$=b=T+653yCBisYi8ZRn-o9?e?(Vp6xXld*@*}O zEcR)|bD=_3H)TXvIT7oF<*p@3)?1U^m;<44x@ zbaYb>c=np*yf^5D-B4D$eKTTq)t*jOS2Q3e8g0qxIEg5c8YaPZv8+6yjsX)3u+hwL z@=@M7d&qZNa50#o`xaUzBBSeua^$OA z)2y0~N!1taIY7r1tvvdun=!tFbpp3vw?I>hD5!X@MGAhUi6RL+N0Y$QE5Ane&i@?0 zgi{*p`_ElB^MCKU|J58F;Y-i{@4GJe4^k9gb^z|b#Ak-R0=D5pK(UjE%%wvChl7cH zC7r*%X2p0=b&B@S?Nj(B`TztTvxRdspp6_z@Q#|#+|Utk$yvbQ-{UEBxi zoO!SJhFLrzZun0kLT?uuq=}eU?YocQsOr>_!V$k1f=6g8lW`X}J?+$uef9X_pTFfT zYdtal$;D`JNfJpZ&4162P#FC@hB{8LMlujuo31i=?-tikSK7<4F$GsW&95(DTAf;o z$dPT=hmrnvO)>zp+uAr4Cw3o#ut&9$AbU$BXo&*qxL6ycs1hBAex!lPh*ECLS&;C| zlfl9#=U1%N8AP~E+qYn0Fvds{R0=@O88$^iZCin*%fdyWyYH-3?GR9+#YYITQ*+`M zK^NVsW(KGD*1it&EeBrE^e#)Ba8EJC7c^f3SxiPBy$X1tTAL%wZYob(a^SpC`Z-?< zTQ(3|C7>I#qIaiys!Mdwt!~+U*qs;ru#9S ze~aBMP25#*FB^+)ycDa7J|=3$I2pxBWZEE?(hvDJ!m5Vnyg-ryePDlWUSV*x+J6E; z8?>@N;i#rGB!mj<85r*a;e5`!Yy`bG%4^MaKEqd2 z_oI0II}?w4dn(;2mDeY`rho~BWQs0n&gM$&P1{T@`pL2_TUG}h8GWu&=PLpRM^0R~ z37`X4RnBCzAV*87)2=42fEBArU8Xxv%2)<_;c1uGUnP0$nmi_5h9Nq!5Eo!)6#iX) zB1nC*Gz}u$%?|D-<`z8Vw*iHmjLXG=4&$U-s1}E8Q0%%GE_WINq0CGxoi>cD;Rtl_ zeg&`^Dlm|uPaayEBj`Lhf0Hj}%&r#|@6TfaI^E#5_=5St8E$0u#8n9Oa2!uxdqyDT zg-kbVUajTl)-fxbY3D`PLS+A#&L!CiDQ#0)`1&ZbSnyx{f3Kk z08r%Vti7)wGaRX0ln#$KKKaQYPs(sU8^g=|mL43cBi{M)@FTkBmcdY0$tvN&#fq`7 z0FI!sY+ZU6HLEeXtLlxcxi;xF=jrs<*zL!5o_$HFe}`i(V*9LWr^9R6o2ovkoX3H6KeX-a~I2=Hoi321`BDxs z1`INNwkm5>gAWY#?N_wR!CC|=KzSU9ci z@ppTajkSL0Xf!f+cVL=riaQ#k9e=W>)XFCh7EMHZQ}`*5kv{|QjBQ7+%AU*&`0H;0 zSSvDnjW-oD-m04i?52MC*IQkfw-OFMo7- zb}o8j;VG`tc}N8?Lvbj^Y|$uRPdD6Lpj~b51<#vaIIgah@EnBuL)&4&+KWp+YBmHz zM9}5{Emf!)X)B*tDp%ejAqy;uy(};x8AK?zIy5O=eI;JP-YQ{$qygPWt)4!f!5X<@ z;fHIr9PKL3##9*P7d2lfuNWlk0R7{#3w4_-T!1C;_UkPCC02^zOV9oPCMHL#2-SDr z5E>Zr6VP3Wj9SAdJGR}VPuS;a0vuOoXGMs+oC05e?z^Q@8yC6`eJfFcVgq?#8VzPt zcDBKb*g=dq2T7qvRlc;i>*DNAhYpj9jRjpPnc0g7sBmLLXAV%%_PV!vRWbu07 zfFcAVKSf&+B*ZwGBzm>ak%v1nx%aE3OYXq7$~GTByJP+T?AhMs`Wj*LAdMHgX7@OM 
zCVwKD29j#Y_f9W-JNeL)@>+!bASA?J6#z)ncW`r0dCr~CwLgP5wULa8RSnn*7GHh? zQhm6Bm<5ny{qZ1BFq6_Ow=twKiwV{K^R6+d{hXmGflBjema&JwjV}%s#O!cHZFzja z;oqA`SW}1Y+_#s$NTX;Hc$uM06Ah6-;;4pDLbQ?)EJ|_z=iLkh2WKha$m0#F-|$g6 z-}dmld`2e>g1=s}!s+rlP%+J`7&42H)ot|Fj6OVhk|Z0nTDzgSZTM|$^qBC#DnVGS zURsMYlZ=~q9z8g$*i-Edlt$^kOqd;bY$trz_@Ucz)uC&H?v1?@S)$cRhAd% z@^$5A{daqlh&a;l;*Il=b5UNwg0|~cD5mALqX5K4lzk)s-04XPpwXFjm>zq!KpmqP zwG25K(r z)yC?7(Njoj41)$nxbu-6hwWytIau^k@>g0@xpq@9uWS1U0;b<2WeTEVx=2COub@>8 zlN%+&*3&DIk9de2G)>7-7vT;dRmk=mo$^kIERW3~%nFhfUr(boV8A?riUn(EOaj)2 zmi4i>%NxBLA`| z)RTYxV{r?N{q%D8Wfw4k_fG*y()2<)3kk*lx>49ih zig&E|mz3zgd%-*Z_qYxE%9q~%KdmJ)eLG5pGFajSNLZUVZPOuv6$1Q!c-JD zNy|^R>ceOyi&`y?9o_m~Z?1B<>D0~17+mgN0{6c&!iiVwqRH5^>Z1lYuo+Vld83Ck z0cjQg3I`otoe5gh-w@p)?M9AOTG%Tgnc+Q0e7Ib*MW%0UUB}!+*N;nc!Pjl_ww%Xw zlc}8V;jnCFOit?SdjY0C^EmERc^zPPW?MlT|5D}Ij}|HuxcDezMbXDF7VPv8X%qRrnqd-3Io=8oL$>a6v!Ry0(+KH(9^7LjzYy*Yg zn-NTd32I5VOBKJjXbUr9O4CC-8J>Y=MN{Wt4{EF&T`Wki>6)(+ZRCftF1X3x@J%6( zfKcuswuuB{D3F6i6RE6@qi>g7+`lQtxuF@3OSVE**9Ocib8z*Q?S#yztb}j#iW_eX z=$+?jbk6zl7a!@DF5xPrA?*@sCgT?P$2j$Lsa!e<dxC(IlONPW0$HD-L32dv)wP?a}!1Q8dg?_J+s2Tt8YWGT-gX?AZD}|?0Myf z&!yvjI_RXno@U>c{U|OLK};(A@gOC%xzW&sZY%RWY&)rPDq1wmFpx8Bzd5J6c&$=>oQ(Xx~u3l)L7CZ>-agV*b ziz}E@qcINbRcN{D$sb%HRf4k@yQ-;R2}>2}dJq0q(&XQ#K*4z{@)dpNMPz7B-3R+Z zIxWZ50?sdd`gO!I5Sm61@ZZhva1It z#0!hd*SUErPjx_S#^>PomOY{ClP7FMXzE$P7yI1RYgO8(4^wMCg54x;3aljsxM8(g zDqF8FyT-|wg^T&>I|!I^(}i^*IzsM|RBhsT7*^$vl=`Iq=TCfD25~Mq8_m;azUrwB5?TNj9eM3a$#(Ylu&txDn>ospO z#M48p-i0vKmxGSetbDi<0HnApXy7JZE)n)8OE9Pbrjn7(MR^>R0^7(cO zBI+Hh(m%U?{fM|lgPBkaehSxI3qC{}G0mh05nHT>prVJy<4%%eEZOv9%>Ku&WdN3O&j*0;{Z-1&slrTk1{7pA^EjqdHFOXhR%fli zyz$bai3n}rT)VpzR2D=dv_4oaBi4wm=OCnVF(ykXvU=%04~#lbL^whs-pWZEisW!c zvhB4!$My5(h?qJvfuR#6*iY48jy9KriP!N|G+reIaG|mkkjs0e4;r#ltQ@$0OUm)D+%sPb6ET_mXDfuotkd1YmEJzx{FA- zRvtks;Z(Xq%L{3>g7artt94iw30fNkmyon5d#@>A$ zKB!Gr7+YO(d4LL5#9I+b0%O+ya0U46<#0@3yEsY0Kfoap)UOo9 z4kD{x*N`!+Agzc7Y|A)?y|$=j_*hLrRNuQ!&CQJzE-#SW6hN?_!k0%iWnrEyaS(6W zm|)sdPM^<0mS=GiF`y~x%Zq?X_%uiqL%6jNKBU=DnzE8x!bZzw+|nipV(XQwRrePX z^;s?Ej73uQ%{m()TW-dZXq5Fewor>ZY2gMXsS8pb$nD)@Dhlg3&e7a)KT37MnMR*> z-6KLbBD{z$B@gyg^Ek~a>){~u4g?*l7rv1wz2mJ3{o})=V>@!zih0s$^)VA{CxeuL z5qt!|e9j0p(|w}H4JGPb{YRKB&NFt}UMtj;5>L?cj zO$C=G!~iD|#Y9F;ZdIga!;4@%`@o@&b-19L0RpgYa~;BV)Sdo^Oz)5BEJ3s~d}3BI zPxG3+@eQ^(?IWM^@-Z@_&7RHF!EV9QR}N8E?#yfQ#6FF(dc0I3vG?;;sqH)g%E!9= zIe3Goee=LO%gFGfUC6B<)}m4=1edKHUC+#{JD#5~&farktVYr?EQiIK%Sy0xpUzuI zec0{&*Dzq5GA?%2+@9v)1@UNUw*Ae8a-(o+93C0NJMzS%+^HbFDksIniLisEA;Xf;CT9c#1~`H8>E&f zg{+W2X7hy>^-`5B;c!SUcg1`)Qi}XmAeBJ(qEmZ;R;ZX{`JME$Y=)#PP6%K4r%JXW zuR6}ccA%+S;|nlGL6C~Xo~R1yU2%4)S+C|wXJPVz$d)*RTdq57tcshNUZM`a)b{#D zi1eAyku_pg7T07~eo|4KR4NAkk_v9XGc+Tz_8I{&6yEamykE%qL0jpTClCY}P`!P7 z%{OO!HdCq~Q{yX+>L^*@H9U3+X51yS@w@wPB5_YKFKJl;WRO@2AK&D>PibJR(-PVC z7`Q?8r%U)K*9q(}vy>j8bo)Y<6wuHb@@JD{B9jd=`VMV6Ii|#xUQCCH3)koeqrcH5 z0#SLFt{t`=fpwP7^STnN2}ut4?qMfc$~LpAjjt7TNm)N;GnR0r@G#krZ{#D6dkZbw zH$A%+bK(91qS{>wNF8Ab+DaRzO?e(`bXw>B+R=z0aCllA8qO+$I^k;l z+p8*~a3M1#CmP2x+xl{E?Cjai6F7}r_S+2RtDg*1z`|~@;M1{lOX&o9oU0QZsYgMn zi_(5C*LnU^bMQcuLdhfv@t@ts03m)pJ+`<9y6BIq@CrF7{h8hw>$F|PF8cCo$mxCZ zPJ47*6k9F+GmDAWgr5P)WaAnj#j*``zz#qzNIYdo&6eF>T!QN_pZuIx6!mWOKu!d@ z+jqN;jwBdBbGpT<-&Rw>Oq*C-{G)r#EzZY9cIW+WO%?*wXCfOO{UGe&wITsonf-CP z%ZoTuGUcJM;rDiMoPG>L_MK~ znA~|U)bdy&;KuzsUg+qB`^|>`C|EgrEii|#YO2Xz|{|@IxznAwaVBJd#(5__h=b+b!9tc<;`?IY z);vFTS_IpkA@E+kcOSKv09UL+l6FHJ^M&8`C-={N?30LcZ{TsrjXam9b}Hv06)wmx zBy*swTQ4MPhqL^e3$)SLIE1R%QJoG?`h83y*IH{%%36e0UoEkZ8Wf!v9W6yVvl_Qd zXfMb^rm9cB?X^?XF=b#TXyM%E^HCB1BqOg;(Ch$k?|&=JBQ{Q6y7D#Z$wg4z9hzlP 
z)_C+GOvMQI+f7@3pJoxA-1TeF4(2kVjw_hc7V=bW`S#MUlz#)_0A9Qlc=b!ilFjGx zGJPkf8}RkRib9-*_bIxUCA?m5+?{KANhK^jtn+?RdAjr4snGuh(RFjWPNC*5L?RrHf<6eZJ44fv>5$(r# zJ12_gfnUMA`>DfusgdmH+hG+~=~&`mF7lNs0;f3G9JL$}DSVP_T-1(@l?*&dAPvBb z;G;P}eg^wcNAPMrmDfL()KW2nQ&Ze2Fj!v&FD4X+7+hp;87g2Y!*}wF9p>-mX3KyV zdYns)BMQY3=l%qGDUEk@yXwZGD4B-rZ_>BMYSJOVo_$?0@695CJZBAjHu#Zvzj zr46-HjPiXKn@~ZW>Uf>WwXm}M=)jc=Wl$__^S#J-EoqT$FIhazNA*O*pk$@d* zWir+^M7jHMKNfV@9Rp7g!>~wU!T&Z$9_9+zvX)2I3Ay#JKx#!p5NKOAdM+fHK<2U- z-i|TB`rO0quhIdW2ZicqDi{we7cS@vQJhOHx*Ix7^T&YItRo)6!Y#+t@dNFDivCfY zYfc_N33yLVL`&qf%Q74&>#1jY^f2I5J>NO~mQ|`{S|*JD#C&~ef_Ff&tI8Kz;6 zq3#u%_w6V8KH31N8GGq~BDl-1&_$-5%(C8fk}Pr3P3x=u`Bl zwO-P$HKa(MZ>c0f%I+eS4t+^L@mrJSw5iCmS z%&14``Ri&A3yak8ZT&gwC9Pn91mJNltqjixyEr=68h&k$P*V$@?1m`qInrOf896=S za4WUVAL~>Bm?4W0`mFN$f27Iz_&2XnB;yJkMbdhti_Kvpp>pZ( zzhM$hQE{;4oDs72;Tg2;(~!!Hc6LJmL2LNZ?Rj$NL?c~54d#Jx9WUcJz!J{B<4-sk zOd^{CvL5mYx!C$c)-wn9BHKf{Vlnb`zrkN88ompwxbX~trcjhDp_(-eRCDwf-MDN! zTb=a-g9vm|X}tFAv5%G3(+5_0L$)H7opvh*2}ga0thfxW9JF2*3Pr+4^lw^2fZQ&N zgY49Z57FoD%aTj8LHlAN(`@n$$}w>1+O0ujoo-0;xP< z!Mq`64Eva0f#y126K9$4 z zP1T~(9>`fQJR#bF&p>3CS8JR1biH4#PazQ-9er&0w@?BdNoBQ{wj+usjsn=q%R(wd zJ16x{3=`&$bLTDNuDAVLW9>-oU9CVgkXof2IR6DaG;{O?@NPx_Gg zEn?B^HE5#I<=VkQF~I6E?-H2ZZG`C=oO4UJIpQyy;SKWMA|-np&xY_dcc8FgxVo~k zqMJ8^E#{1y#X@#n>E7>~+@u*jxeSOULv+{@4s48YhOlwOyVMgW_K_X+@3eu1rRihS z1|{+RnZK6D-Hu{Hre1ks%NlxRAQFEo9Ix|XPB`i3T~~mxA7KM9;=Sx)9Q~kp0%pHA z0R4xDPRRIkfRm(;-%1F+&`Q>|8Z;mho>vck`(%W=hNn_@Fe>)=r;A=ArQZ1iDyS#v z!M{;>fup3VgB~C30O*;OkgxkIvmD|I4kw7;I{auIRl`Sp=;BEoRcV$V!dj0QyOPZA zU8=AQOTxK!IODgsardh*w_<(IR+}uitSKB!2PY!_mQe6dvGMtMCqHM^G*^#Pjxy1# z+meHNq&^amu}l6@K{2zG^qwbL$1zK4LIY@bA@x{4Sj^I&hT(b(&tRb``~OT<^TtNr zu9^vniiQL63DS3|9Gv(U+uN&2OB_tfwU)C5^E8PE$8H#z;g*=^wJ#S(_uyU2X)`u7 z9Q#$CpRZ?v8DEY^!dCWlGKr5@QBCI>78L=KSOfZvf$kd+FE8dDtJ>K8`_ZYxgt^EW zMHm*iE4BEmPQFuhE|%Mi?pcA?3~!(ItH)O+s5#_GO~l3cqd+)x#8^aWvt#-+UC>fP zc>(}$Ct7s@DkE|;W>CzgPHdmrgT3phY9{xaGL*yfb+P$Ea)&Wnx3@0fAni^8@+u)Ypec*uxq(u)hZ3cGC8V~<<=xV3K?!4z#$o(_5PCe$i5$oHP& z@bsEHzD$>edRPTD0MiD)NDz^F)_0!cei2MwrTKDXiNRu=z}RySE@Z4cQ~FAy}Z2^7JNP@T}vk^!~W_;@ek(t&emhCr3mE% z$WR^v%m}`{J~O9p!~jM>st%X7f%_qP(;xl`D~7 zc}H2~#bhf7OpEWjlm9vD2qatdD1XalDV4u$ZQsPLaM0}(GTGXNtM6z_ox#WMF@1&w z;dH3FcIq_xe|a2-z`AZM-W9%9ix-Bd&|BPQM11g-jfT;F>7tzY8Ejl`_7??!IY9tE zLxho&-LOo6#=DGxs-E$J4`Q=(veDRlMR}0|c=p8CrrfsKUW1S61!lhyD0fJk-r}qa zPn)?btw}`cY;0sZGf4a2*P?61ky-)b$$@`+?m$E)Zjk;Mh#xQ(&;nl1eJLz;PDw#~ zCNZIMapJtF7lJ>VaVn|_z>=H;h>dK&{irkA>|VQ7nd=~MWN`0!fCwTYfBz0>C~4{! 
zIUT{n%A*{t+nT1%yoozXb9zhM=t!Y4B7lnZlcN2znS-mcS1Z4hJQ9(uF9}Z^n&dvX zqNg&ikW*Hmfd+qTsHJV3zP|cNdd{P@p(903j%BvG=!|&q_0NBjGqt*=bCGky1tq5+ zy^Cu9uyPv~iC6E?^nO=A8LImj{;>>4UemQTp0TnKQ2sTJaayl02|sK0{)BnHSUPV2 zLUs>|kVWf2h4`WKL+tgQB}_n!3PMO(o0%ltD; zmo+@ICHBPZytn5iQxn_yH+I1yE$0C_&$hb1m^e|+sk`&7Bd+wKXwu}=-0(kZ&*_9~*ewn|T5S8)`rL)<0%ZQUF zr6)GpS6yc>`ax&=g!pZ@YHw-LK*XqVjl!9pyy^-rbgHU%J#Nh^eo+Kno_%_}&bj~M zK^4%!=>?WSfN*L7=q3IX$iiQDw9MN&;vC>#Tyn3TO6QyeG@B*W6ygznZ6>f$aA&zl2^U#U%Fz~zoX!!0lUc}TzB#|8Uv+x!JHRC#2tmO!sw)M z2|s{TIwK)=De#=sMgE=WY!YL!KTTd3yu( z*3l-n_1bKTGEq3(Jk;>U%{D-#-0Xk!FA z-ZSn>oL53fFl8?Lc?nY&FCq$B;{48N8;lWLu_A861@QLI!DMLr>yO(kI>3EgJ6f&< zj;uCi_Unmh9sfC$FP2f>$wGs8eP$>hpSs5J*3=B&#JlxGA8(naa0{!&r;Z zKHsN*B`ucZ=Rxr%>+lFF1?HwB$M0jd^M%U0ln#S=g*6(gBN?o>t1B>ALn!?pO9SpG zg9`Zl!IjKT_JC32@kabA^5T`0K}ym<%ve1V%QeskL*xNrL%&js)xaRX%@Q1Bi|CKn z7IVu$C&B`VPonP$VwFEl;|zDoE=^LR`lvv+&8sH~3d=2E94R6eh476pYUV0F-goQJ!;#Io!A_9RsIFH z6NAUrGsRHHlA%^W9>)_AB4qN`VnW{6$=c^E(d$dwbs&U=?Ql0vVQEwHgMEM8vcHw5 zOGN%Zg=~w^b82ayIlBPyyb=qrk@le;n|3qJC2kyH?S_Z>oQW<`ah05fWal6+pce$% z>Zl1&+OJ~#{cT1ZeZN?8(|I1mhOi0uafE_Rem9yG}0oEDPj&My4~YmMKDWB=?n#JI2A-8ie!m% z|4CO-9+-~;sLgp^8vkw0I3*(Enc`b5x8`eo$s67?K1-+BPUtSxkJEe;dTSut#@7FE z!9L@!zERp@j-Fq(GB$T0o?LI#JesS!IBOEb1%LA-pwU_L{qo9BL=;eOKXH;qu?J&m z&CyJsSLHZmGD`VOaAT2(55JcO)5Q@Q19=aL0yBOn>F?@z2~)T7yyFPuV62nKG|s%M z$BD2n;{LCs$&vJ9H?RCJ@OYZzGN?Zw5rBs!N2sjAL=>(OnPpZ3fHn0g0Wc@#Gben9 zp_K~G&(s!5I$%xF8;pblLasU5X`utP&~67Q0cjl(TrI1%xdp_}y9l>`S&Fs#KTjYS z+9Y)z`uxX$_7)RGro*#?gxFH+;Z(FWG)BFE#Yr)2g+DF+7=hh{#DK?k-sy^5-ad=J<$i&#jHLM$tp@*z=s^I)??Nkj4 z>q-6s4cG;8e5;|wB^%{&0N`(U!%mpzh*FTzqX^3t%S@npEh8Iy*r~J4$UN-z{m-Ik zE?eoH;N&aH?Dqn%YRdhcmtBV8fY|Ao;>|9eZ%s9_S)92-PFrv35;9}*1tw@scNfk< z5PqkCX0*6re)Y^!GsFeG`y(0rR7g;Ub2xCmxRI zaTG}U#Fx44*iGK5i^LQEA0A=*2;SW23UYV`a?6E&V{|mn%?^1X-?3MXi+1Zqh#Xwgs~xmfL-)xh)vL`)JXZ8nh&QCmq08SZ;x>Fr(esw zvf4>7ZOs=e(RQe#gJpIegSneu@~z2X>cJm6$SbPER&ZBqkS z8%f6YD-9q`{i zy8vbahXb!r4B+wtr@E&ym8=R0^Xf&wdh++%xoh42{sn%iqwY)jFUz0M8&dkS~eN{Sp3~N;{)SDwByH&^94wG0G z(EdR+8GuG}I_H+I2cLbgktbd1qQC6Y=Y)bc#~ehz0&uw6S=I?{zlM3KuPPGbGs;@y zFw+s7|J4J=-|Zb_wfAuM+o)X$MhW|!p#&AVzo$N&zSIp<$=1NNy$p&*RU|={BHsPT zg5>smu`HgH=OH_s2UrlCC12fkU{8zA7b zNr#sPdn17*iWCn~Jvyf)N6doeU4^)K5M%G_EuOT+1nWO5Flt0z|9pvFxrn@FAQ%bi zIz-nUwf-dFZe&HE+w3oWfRU_4bjQTHcUw7B`vMJ?EqF)O`;)R#s-B=~Ec3S!& zWX!(8tud)w>ZzP`KMb|#%MXn zA$gSnIl`PgiJ82^VjkY4t{Cry1$xX1b4%ui2y6q_Af-@I|Qxegl4^84dn24sV# zbQ%}^M^R-)#1kpyb@piAg-_z2Wd#^ghNjT}c=aTpt_~Q}p*X-h#nJR&ov%!asr4G( zBv1OB{pMYL*{c9&M2;SHBzR2Yunb9xdaFs|wXLuXRVt$@G~#rmqCoywX2sbC6}n5|u5 ztOYbdOnhXu3nPq;WDpU##`>aVwGz7}@p9Mc{X?Pd2W>C&CbjiEfvV2Mui=}b{uE75 zX7?p`EQP^0F{T3YhLqgM8zumYW)t@r#Ymi|i)f5&ifFvFJIcK_SH3w*GX7W+@ zHC zmw>YCv^_NXMmdj=Wv78DySNprr&qkqth&v!sZ+o>=3cT{Pyj?Qx#DNsq#0(=^hLK=?R=;;nBsqz6cwcahDD%W70q^iGDnuFdJ8*L}x=7osxze9b6d>>aWzz1|lwjkH` zUGK2YR1y1ttHTH`gk3yQVRH{XF+QjG?W1A8(-4=O;*GpuT^e;(Gpz-raFxc_d@PEmQ)Hd2TDtVkove39td42Zocqt(demfEJJ5eLQ2vkSgq2`6?4DwsF4&X!FeB?)DwHwJAY z=OeTbf0-TCI6*MNny9dhg{ODbMDXFS8nEkze16M>7I<46$U-bD$xfx7FNuj&(*CFr)*xM5CT*yf?*b_*clIz z8&XZ7C&)qOviKU8P*kuGa^MBUJgVE5<(=z6jmSOyfmAAjS7JX4U z&c-{}7usMg1VK{&!)@-@g)wk1T?4rO6Y-@So}89y^DXVaUVn9d+2wR#FL_(DkzERL z_A_fZZq`5V(8*Slu>Tq3!!G4jeyzX+^2-999vu* z5-}zdi2*DfPFSe+AWn(_rB_Ili8psR$i)-_*PZriFZyGdJCXxti-gvM%#jjBL7`zW zQXc94cD>|N8FYK zz`T2nb8zpMA-VniL%&_2N?6*RG!%8v4{5=8TYbVzyE1h^ZB8`FR-#OFgU%`pzI~Ft zop7|Padm|r9jLALKt*7dz!MZKqWgHnD-KNWQ1cvf$3`O7Z8_2G{}Wn?O6e-D^0%&T=KB@t$@bc7Z?J47%wulfd|n0!e#b) zcWfV5{JsQilc9rzUQc0V0yfti4yOEP&Gp`Te)XfwY?zvXT~brX_@cY_FA(qHh}%V& zB`l%NvOS7)-CJi9z!j@TSO@(a-E1~Pm+*Prfui><4G|Am{bX`#X17U4Fw3D1b`WcJ 
zCF?)=`yw~;BiUzCYY1NvsP-h*O~-BEd9^qdjXJY9<)wOYvB>>Wcn@pRv&-^<+TKV5iOJ%W)=Fm2jAS1e1< zHb~;95vfII6u0GwHVO=nkvXq2hSa1{Z)veh+|y&@=ct15Te}Y0kFX1Pnx_cP#tvh( zgg648|1>gEKadcyrFE?C5yW=YaS$W?V`Lg`KI~;U!iUEnrPOy2K&r;4nW4QeJ7d<( zANtbiIpD+Dp2KPj7U>?8BRwpLQhIn*U)t{${{Af@r)~BXiv>{^HwZ0#{6E64{j%Pv z*>uH8Ka8nblX_@Z?PEABj&3}g{Lk4%&%JV!n%@iD@LsLF zQ3fZzPo6uWJh&{>i{rT<&p2Lqapn?-dXX#m{=YY{D1k+)E5DA6KzkdlLqM%t@XxLZ zx&S^P>U!;O-ybiiaX$$Mo+%aK&>6YmHce479%VQD>!g;p`ZJl7Lb|M6jBDw7<9&Jl zeMplLQcgDC!d{Ws@nd2YpIoND=L5UM07VtESIJkPDx#?M2nwe^Ml!8{(P8gM$VdT- zw)>?V79&$XkB(oUd(%&$5}bL~Rt{uj!p&$3un!S-fi^vfVvAFPs%zv&v4k#AjEU6c z*4U(BAl{reS1mON^53Fp<%H|(>J2Og1T7FCBoe{iJs!T6$0CF+?P0> zT5RcZyE~Vf(P3eNIU!YzI3!mkQ*d6t@0z}pcrjfsIOO?*@x*YCe3@R#79u)mrE4rP zQfk?TqY(n1rdj(U@^lXdqrFCte$yG)4YKaWY*82xqd@QWEl~U1>Y9ZEAl+#i%%YZ8 z>-@H4t%RwbD&_DLO|2zK`yg}Sx};onmjRj)3(ei)@m;FhX;qzHAJWzew9PQDZGdzY zX!8^#4GJFccu{+rP}Lg`{nf=UCZ#v6Y^>d@d{SvKYtL>`-Cx9wKPsglGKr_%GJNDd zv;fmANBL%zid7ZQ;O~C%9RL=1%bK7*vJRyu;=jd`hx+*;2ppRpPxrDnU~<wwT3M0?u;Y9>bOS&dA$UJH(flvszK zJK)iVjaarJZ25!5>~RCBhQ56jalSt&RAx)HlK_>{mcSOQW`;J0RT1l-LE(XP!wTUDXC$*u z&^B^?jV=R7$S2xlEP6V6Bb^mHiB5J}v-6@vBs{WfydQwWP1weN{+nJC?bVUBWW+x; zqJQ5GbjPr*5c491H;gOm37>SG>xs*KCM`nFT}^)(a^5eA1e&(w2&r$m9N6t2;-zVB z_PXCkWKQll#O;Y*$z*>5_Z`9kqASWwZ!5<-%jK!BjBS&-3?BSJreFV1QIZaX6WwgrYP_D zFv>)orCm&ou{c21j{C^1`zHdA51}DEX-!^jwWJL_Qi-@W^Hpoj1;{BCz4e6E|SCoJraV( zATK<`s`N8EXPEJ(+tspUK3NY%D!PrHlvZKJ5(vd(PTK5D%aUb6@2$1>ErngDNontt zDIkLDVY4r@>eBkjRxnaN24b{T!+)MMBt>Swr2rSUU*HGO9Oo07SP_y-Sb_vqBGK6c z_zXNZe4tu0pfdmJD&Jn3Yg9+82R_84?4nMWmI5#-@I2If?x&h#qHW4Q^m(YnBC^q` z0nLudor<*qtKWh|bM$Nqp%lrq59vjw!tjh_-Du7j{Jt|teYVG2qcNS|l&WP#fwfK; z@m_DgDhpkH8q+~q>*{wp22%dbAR|}shzUnNsPhHZP{%VQZ zCq^Ol%7Yr_5BL*yX#M-TMm3SlUW65TzOX)%U~^CQH%7o0omz}C2|uxUOiuGc~?(Nj>%^*yQz-Ix5lkG6G1 zGzK;Xa%L4t_9;YnnI4va!5n(9UFPnzQe3^Oi9BI|(_F zB$q8nVP0e*>_Le+(O@dB!v$s+n;*Yq)SKG2%+E^<74$sE@jkUsx4E5j;-{Y98PwQp zkXn=ZmwS)XIqJf!b?R;Fx|*)mbE9KHHq&1;xDR<9K5g%>^#M=P#E4)eXTVz;4A$;p z%6C$`jurn$6I_+2B33roVSd=eNY*TUENlvUL&-)$;Y^zu_V99~!c}?o)>oRd%?s|WYggt4rZhyx7q!Kng=Sn==xO_rfUB`wph#_Q%yKgX!m@V2MxaeID?Qog zzNkK9*7!P2o3YP?>A}qUQ4?_m9ogG_K9m5imTN3T{((=O;-};2Nz9`kf`0Tu66-9s zj#GqbUYKzCgmWSBU5(9e5*Xu#T2?R}BnW+w^As!zWSE#6SW{&vM$q+NsE3e2eQCB* z%R^)FXdfv*(I23jQIX8nA%0;HEU*hTH!`+oY|?V5j~_H?yYXNLpTs)g81|AT>MEL; zG$LyY5R#A^hzpJv4K*_FOJuV@}BPrLv|VDEg`#S~J8>eZ=^4hmDZ=>?=gG2|zf0;nXW#SX(w05R}1HFLdll zTDxo%kHUek+Yliel+0nxyVpj42T0`mW8mUy5+Ax+&9H(q3UncOh|z>q8H>1SrT(_- z0k2uG!rJN5*j_;niKd@Adjw>h>%a&QLy%PJ+SF`A+O^8YVd^WLx!E3UgG_d9F5sL_ zZFP==^>}DWei2iP;TycTDeo&wxGRkPjRNW(bUHV6>Wl+5?du>Nj>xL8g8HKEv9W&} zL9z@Hjmx*b>ExXO9YfEh_5UU)LO|REx;(WIS~kEkSQj;qv~2>~V0h;3d3HP=hk?~F z1cfIpAI+nLZN{WwTp1kxLQnuNwnPwOQD39)Pf?73)8d|QN^y#Cr-XjA+qXW!Y>G3P z>e7;~w8>r#@rWF0JWhIPWnhiJMn3uo0pZO~_5K}6S7ta{L-CIPSp+lEky$t}OSou0hOvdUD|I19Dp~cA! 
zN=P%$nuuP9sX;h#5??am!p{X1I>WfC3&SyKK@C9kfDZ<`{davNBLm z9)ZDe==MvUYMrPD!Z7ufrJP`3g$flv{`v^C-^U)-m-!$oL+w6oP2|}MMoMUS5wu7O3-0#(DGWq|T|$ zRBprBskYyO?Ld7<_&E0sLN;mF)B~9n>Z8gvHtLiJepp`BdsK~~z-8zw^%Z)dChC%mTRyuhwmQ_H1JAiKwT1-%<3=yA!RFLs1Mwy(HNt zYB+T@L1`asMJmDWaAr;Q{f;WDDdW`GfCJjJ?0t!{>^qf&5tyrWCD`stRm?UupX?wO z%M6qx#kOBU>N$x66wuh|&d;U_I3x~?#S?Gaiye94$T<>zp z8#Ie~N3r1v{PEKvLh5KuLMaYc?F){fHdTaE5)o`Ve7vYsLKD66;WRz4EKEp1`Im+n zF*(lXxORCco|Mcw;FR8AA*yDETH&>ibjP=13C~JJ^pZV=os^DBWMPeWFFHe`GU{Mp!I5;H^Zbl%PW-pcp3^KjWGJ=FSkf!k&80?N^ zP$XMErG!S(P`Wln=sH}tp#p&HdW-kQvAPvoz!v-`LdT%Ck;cw`NLt#UR3l++?Alt- zMV^vz?d7~n@Ip=)*h718A}l&Iym99dg3bN|rM>&?`@24K9Ly~}!{7NL+y}Nm9AjJn zF&pnV+7-}<7|W`WTXaXf$sO1oTDLtQdH^GV$5}xK3o|;Z>V{41H_*+fZsSVl5Q~K; z&YRMR}etVd`; zJ@3Zv9<@nOhU1>T8uuQebK93hk&hG;s(RD;;nMQ-CA?mhfxkNk7xFxD2nR38sM)<{ z*vqj()&4&BjSXYc{}54IAN!(j)nse@F&DBShZW_y{0dF>+>Iy;waYEOhNmBae4~>? zeOg1F3>O^5H&N-5nRYKb6&crYgBd>5*q9*S5;?trlb7J+;a0EA3g8T`IO^4nN-v=u zr;6mDk;tE{FYJv@&cO96CCJ78dY^=~ z;LuT?^n}<(Y2y1h{Y-f6kFHCP+@zW)=r{HHSNuMh#aDhri@jZ&aH*@1}lS zu9^Tna}8d$4tjk)7D3u9y<>efdy^Uq>=Sh<#^B}V!<5V6ugp!pgs@$MlfYoCT*y?iByhA-UjBuH%#Qi+t&da`~-LBlq_TdzQcYOo{^ zpE02k=|r_!x{mPAVX@g%ew&zJ4hs3|I-4g1mTxA1ERC@r2dQk0t68}OG0y=)Ym%8% zY-Sb<3H<|y8yArI1wk_)FciE{P>Zcrw|ufbP0BR@a7m`47dTJ8xk{rQxrh$&s8ujn zu{>?gVe)qiGwBjDptCsDqDSUN(Yrng^;=r1sv?Wjl22?kSZWxw*Cv)U$}LSm*O>k$ zlo39XVIZ`-ZSyxv<0T=7#-4uw8%Jf57>^%Tw5gJ~QM5-df4M|nlUE%&o`QgYTB@1M z2~@BH$wBWY zUyz9QLnpPMnQY5_Q>a%5*_q@C!gW*Vb)vg0S%&JsF;%xIb|75mLX?eLOfl99XyNmnX*_e zppb`C(i^&n{|MhL(q<|4mZXvx39==Mx`?zhoN?0ybc`z}S+aAEUIKC&JNxt7C;5s)`}qN=T${o8rdi(WfFT|EFA zx)1f=?7^BwN)UKU?D9wVGEdI{lU1}~=n;}{l9e0*bY9(D!ZJ%gc4G!qVA#0!#Eyh$_8*5+wx zijYo-v-WsFG+q^>?W+-4Z%KLlj>%AJccw_X*&;WdnW7#_sgfP3ZxvflO>^t0`Ht!4 zmGitjQ;s`q(=e+fGV7C8rQQ=W$p*OURf-dIeDQ8^@T_=be49dQ`-U_keba-uP*;7L zDHCHl^3ir-BPS($SI`7Th7&5}J28$x?P{8s_Ir5>lb8pc*mT`q*wS01RW44d81M-`{1sYc9DJZ^iC@&)fWM#8NUmqTs329 ziimnEpayViZiv$#X`9wDU{}YTYuca!NQ8TEK#)dCRuK{Tvf#i)gF$`<>9$uQW01=( zd#F{w+s+**WAMXfwgusle}8{Hi&?A0PjF>v;AcEMmBzdi0o7$@R>AJ%p&38L1+ zugQ=@SMVn223p|jam0M*Zd2|g%S`T+)m6{QKuAm+ASb=q<)IbDa3vjf=+d2f9?7wW z+#6V?o)%>w466|p>6H7}(9d>yd>!Ob<31=n7|5tyae3kOKyG=*z`J*h&jjs3r+Y>a zWNlfd+5>{B-aB?3slm%lmefdWlj`!33E_PD50gZ+cHu@YNz5tR7FQY#5R`|M=q4V^ zAg}uy_NS!H(Vy;V_=*~O!AWPm3yk?Y)%s0&EatJJ;%XX-!W+6EOwV&#SdXPqt$?k# zYOZ@nZ$UaT8>t8+J=mn7P5^xZqrm$pkEp1@+6_=46`K6_m)Gk}3F$p1g(6Z+l-3Cr zkVt}dh1tZw-G1r5PRUq)=`>TLqQj?W3^lE2v>wJ9tDWy>-xSa*d(uKE02`V>1M8vM zVAIbblxJVHeJ4B8y4WGm+u1eL8-w7%za5=+8E>dxL0GkXm4PwCKU6WJ4)r`U`b>78 zyj?3MZa3Cb2QaFgMAs^lZ;IA{o!nou$w?_<*6^C;9&=7qy>~Gg5QI-Jk|9DyyoDP{ zs9_dgtEeZ2O|bL*S(!d>ZfD$b3az>mC68u`XX^@eIy}vt84MEP1c@d=oRL-Kl+Qrh?W_5*`stwFazxI$nV*0(Fv#$`KfC8L~Zcb?oI# zTmX&1(vjjdKhiTDZN5-vF;yA;rP+DL$ESd-^SgM(_S)Q%TV4{gt_an~9AN(6t!vY7 zJ*0gR`o^dUZ8T6*LI2Dkdn&b^9(L{#Jf!JJ0pq@DfGeMrCInZ4|1LC(w7?*XSFl4ND_@j@WJJ|CL7Nms%xD#${?g zwZ!}tT<7XOeVwLZ+>QQnmaq5|&V=8oTe;?aXA@FK-q<2gHwE`8ofHAA;|;78zHP+| zhDYOixeTwnYaV^Dsav1Ag%a>iWt86C=Kz13t|^MmjJBX^AV2)>seI@tC^NB4&HgpL z&br)Q)c1SRNJ7z=>84KzL}kftT~--CjlBBfqkk@lNR*Z1h5#5F-BA%ms=>MEe;tIz zl7*-w6N_`c6R+P6Mv9-^V5u~&k-q45=>W%=b{Y`s%-XbrZl8{4*R z+cs8gtk_wxZQHhOTPwD0+sVnUzg68j``p*j=G$!j8-329P?b$s$;k+ubh(tpoU&BW zjfW(}VIhE1r9#PI4zs(0M&a-(<^VsC{r#d?&!yK4=4cOz(XcuRR zn8%xPbN$rowfP0)DYrW9tqJi7u~0#O@!2g?*mtYDgo#^*5BStH|D}h~#U?^tD$%_M z?>XcgZw%-4@rz`syv{vVs#)&d$EUE+h16FpMO_L-=i0g{e7UP!O#FHC+vHH2P~ue% zaPLQHcXs$#u+n-p9Mx0?%!`Z;V7>00@8&AYkd^J#?lJ>^g0{?13Lok{>+G7m2FE)8 zgo_V#ti+v;PY|_eE48vGb`ErrcUOFZZm&Qv;$Jzg^Vc8{Kc%@#V-Z)jkB+rQa-;H| z_;?a*xpe{hulk^uOQ>Xfzr(Ec%3tgojHISp^h;p$%iCBxNNv32HEqr@IDurRYzh(I 
z=Hq~ty)G%_{$AKC=2ZkpTJ4A-8WQw@o~m!9Q`e$5Ma@Lq;JPmCMR;3Y8TdvuS{g$* z>U>r>fZ*gj#&-Qi6Z*;3pi>zlc%iA3gzde*Gf8H?&G#bB1x_y&Tg$Z-z!itC$yYa$nUhuQCtT(KdOe8r0|Q;O{a>~i5YS)zk&0Miitj4HYG*;JED)G73#YR zXmBjm%^L)ks(D6=f(D5B-e_uu(wQM0a{0@NWEGzzsY{95XDf^3fw>|hr|IVp!mLAe z$Ef{p&nt%%9~u4Cn9?yfJ*ted?pHBP3Lc z8`_0wV+>88%3P9YOyN{Qp-y(z3XX;kFO2d+pu|;9#XDBAf__ZR7Iq!p1cz_P(HaNA z#H^5J=&gbqaAWYLbc6PW>|aFjZI>InfIvF$s@@ve<)LByNjPJk@-LSJ*S$TDc;^U* zQL*#9G_A|5Qr{b@_PNhN;)lhC(^EC`=?=xw3PWOHJi(%bXsU*&q$aEQpeylM6)#-!U{hY>P9i7%2M)f<7 z$cF{eQ1%L;d0m?b#XJSSUY&s^^Fv`RVl@x0TdK6^G_$iBQMjGNM9e@$h z&6<$n0Jv}_t5P@h(naV43=b}IXVQdAwrsj9n!_EMz`Yn>IP@5#$25n$f^;=)uOfC* zEQ<;;SbXGBz&ew7CbiAHz7(PZvYvO~Q7Rih58kG4p(6A6=em7n;`@{O8U9&Hb2UCI zLh9EG=lU&?@g-$FO%r?%*<=TRRDKV_y6>awA%6pNY^>VJsQe+F`e=p2@l{FcS$X>4 zZhbcX$Hxh5fqL6u$~^!8pn+gIiIook;KW&0L&l@@nruskI`7Vs+8C`9hZ)3Eis2py z=A0*18P;ZtbvpC5YfdimP}&{-dK8)dI=(uz_vU=RAIN}fd(Lqdu9`Pak*&?^A;J<1 z+$kdg1ivG$x|he$ic9XdAPknbtv@~G6Gio^Jnuy~Zgg}9scVkJ6%+j6l5EM!Vx>zX z&?&}Dc)jK$l_%5U*kGkce5NzsQZ1FQRK(;qW|k?*A~A2+6HNQZH_oGO#iREo`_la~ zPs4yd>GC>l0{=Ss1Yf>n+w>HdJ;Kcxwkok)AJcIcojxFH@f)t8!dtNFq`46~;{YjY zBh1Ei#!3=4cis_VbqzV-o0069DwvcNeKL7wQ008mM{{#TD4IG7X{uQ7g{3r0+;=clb9)js6tPn!b zBB!Of__fe`y-hv%Xud9S{9{!$j4uMLzsD|5TdN2UEzmy&Pnk@qD3}r33{-5?=ElOp zN0E{0x|4WXdLj~ALZsS^X02KiKXqhr_Vzd!sGMz+dl9w6yJEMo<6!9%#7L+yaqa#< zjjdCLk!xhtd&B>!#FVZ9XLweg7RIRI##y+^48MT>q=t*U9$bS%-3#TfZg7gX2IGA9 z5eKbni*7UnU=IP|8YJSK>0u2rE;k)}#iP}cN58d6>NLSvbMP7_s%zHp0ixyXdEKRU{P6E z9RM_#VEDyfPT9ac4T5UV>_*Mh_*CRg)ELg(z^zZ>O5wfp zVUSEk3cl`nfMfa$fG_Piz2Mu9_J99Ak%E(Yxl^IFF*<)!@?$F(&}M58RPtrRM~yes zxC>S~OnU=n0P3go-)m;FxJnYM-2QM05qejqR)o>wl9GFICi7dBUae)rlGjYd>RGn; znK2Ko+zl89Bto5bQJMa0du6H0Np$apB3!k`S>^hsUBPH=ZhSswX|suF?26nc^$o_w z7+a|SuB19vK`MfcXLk?^`uL{ra5p{M7UW%k@UTchJ>B(7+s+*(YW7Mtom(9+Ir;|m zmIbPyUiTV3izi9EiY2%9n-|=~47E=;-rQ$D}Aq~tBZR|PZv2(%vc@QHTGO0{Q^q6o&>ROwpPf^b*_$8 zX@r!rn|zit=dqWslbyuvT6(!#1zzRi9Q)&g%AzP{w^PorTVOb9{tK=8IOr2Ko{$M9Dd7NB>IV9tjnCph6mA(nC3Eci5Ft z&dbZqKkmt!ZCk0hdwWn`h_L{V>Ea9?T|$OJS+NXKqg~tV7Y$C5XuO8(wT9c3$jsOH zL_NH^ExVf2S|e+%lf|PmjaAcGnx4?_~YFhqt*sJQ3V8RIWQA<6 zC*hy^#*0KWHHkO$P}07o>8=?lHgt|$SFe25p2+gL9Fz7&aBQN*4lby(7%UEN9H<+@ ziF8GRzwR=(!Ak+jQqh2uZp&jCI-j|Fuy%H+*Du-Bak(ZPm@^%F+_Eh#FK<$M!RrS^S@pf3Z|Bi9PQWrwS%+Dp!Wt-G z{7IJ%7M3r?`-kCE1xWn(1%Y7W(BcRJg%g+PGozHK?#F8x@K` zTK=38LG+MQpbt5ae_{;00g`^J%1qpa3J5`3J>vk{g3x9LN3Znp<_@oRw|V7j)~WXL zqh*4AG=@+!rpd4SY?_p=b&rv0y9n?)WZ!aI$Gn5>dtf1R*Ut8d+ zV(;th+q3Q4C)GxJ$-siY*`w z$99TF+Ni2hr!fg&V+1F*u1VJ!+%qUSroAmo%?v70VAF!PL2P_0Xac05vD^>_vqj7zF zomP*NWGq8;4geMU5mavDIFUJ#0R*HOG{t#u!!?nu#N3K#4JVy0e6sQfy zZ!)kENOOCwKgAlBWvKhsCSHP=KZ5`kL*nDqI_3X~tJ-Gf<(8XBZ=W{mC zzk6YAyWivkh+7TkhUvL1Xh0d>f23zyPj|mn;39f$^09t*o*6e+d$PptOHLv@HnJDk zAH-S~zse$7oV!`i_Oq@CKSF|lty2a1ay%2Yt^i0v+8%@K$Qy_JwCfRIpTe%J-HRIlX0flt@_c(cT2H5UD=8(H?25(A+p%o8MQ6@eqVaruMvsv zQ~y)Lg(9*5xBpV7>CWP!A7ZS#d0U4=Xp$;k}& z`AxZXRW>UIAB}AynP@6l7j~5SP(VxRIN-1jm48FAp%ts%YDdcb5RH_1G0#3NAbgy_ z&i8yhUWe_y|AN%)59&ZOZIY^u#qwwXEwZqeGtX%aiMy6Ry(1h?AbWetZ!@rc>htru ziy%WU=|{a1ix_xhWO-_yH7eU2*((yER=HnpP^}^7EI1ghU%Hot&J*2kmPfTjrR+ex z=ayFoe#VZRqTit=O(H}EcTN$vx0zJak$Vfdl8pb-Rt}6&qO#2Yt*so|e`#yLSO5qg z004|pFoWqwTV+hC;NJic7+D$)_=8z<{-Hsm8Y>eGdJH?C@ z?yh}ke6ifog`oz%XItfwsB$*5l-f75`2%dz3gW%tLzhu^F^B>&>05>1(_FuMl1o(W zol-8^`g4j`!y7f<9Y$$ch$c}rS22(LAqhbJg7wZ(*URH(;cwTO-v$veqvVq~wKj+H zP8a4s3???`*a%k&)~balS(2A2YnT|0^a!Q?X%0Wj6Mzp;7me2Qi!#nN=aCNHehTO^ zkx9FQm1@^8UZlsP&D3I(4>-E&J{Y1gLj16+~?-;5s_1UkNchd!IHz$ zZ+>J_7fO(3)_S@N?yoa6ELsinK1-3-O1}Ju2f&J7kYCO5lHW9mtyh zdqP?EcBA03)$EVU-<^&o%&*D!7(qNrG;TbO)E+1Ir|)NUo=Bd^)t@(cAP9?ZP&>kv 
z*kls^6a_;`7_CQ?bqk)yKT3e1Hu?|vqc<`6V5gu2Qmm-qOeS^-^t4+whX-tm+muj^XD5A; zCgmOcT_wWwZ5?_LkHN+qsOv1ZW=oe$*fCHzcQXx+oUjd?War+;ij5iFgQi0TfnTHL zdu-UI@nPllS&4|9#7nHNI2#N zxaT>+)?gT-o=}bpm(c>k&~ugA;lIC?0tTY?qHAGOWSq^dYJ3n^G6W%(1ppeF2PB>T zK$w9Q1L&V|g}FjcoX7QaJmxdN@_X`Akarbd2I2rqVA%VIrmI+r|RIKqHzCMN|?(S|6Ghr-HqWN|z^>zk%t^e4!=q#qH3 z9+wmsyUT&>a*1Fk%z-$C-diAw&hRhP(brL!*{8MU)MgO9-A9$igc;QWL*!$F6x<3%C6Bm`hJ#aGr^)Cs3Kwy3x9K$TPJUXt9 z)c`tGW8t!JWR%#Y<%P~B3UHc>@B-f)--dc*X?;OgyraQ)l}n4%?!1oiWj~iw&=uj( zdGhz4W~0T}Wz{pRK%1T;_W3Cv!KS;7qD~ha-gUBXzoQQt*zRu@1AKhSIPf8el2*4i zxep~NG;FxO*Lh+%8eqK>NxIFmL4LnmAZ$f{NTf1j^CqHr;-mLwsx$!BBmHR9b$^P) zYZ9tg{-X@CFQWz^DvFb`bY|k@VN_<2paLbx>l)j3>=VKTWbE>*#Mcc>1VN#NHBb5C2Z@L`&{}{QS|8^jyf*H>LPBxOOAuSrN+>#E=apDt_U7=`MFMGzDlqt^n z@N8U+p=yx`MA`+czGu;@qsGQ@i63?F*;r?VQ z%MEEAACFIQbz)D@x7&(uXYfkN`pwQZVm`m9yU$Vk`YQ_5OSnF{G3WGh zU|pG1iMeYZ0WZ_y_M8=xJV(Shs=EgtYwx&q@1=)2mr;tS!>4an)$%jG(69P(K=F1L zxIc!^q7r8YwB+ofmdksk6JtfRI3y`=N{Mdxrg4$-j0a}lPX>2T28=^ZH4Irao7+t^ zeIoAR!v1*2h=ylW-%^0@=9GoNZ6o`@2 zAm=G5L`dl;KM|Gt2Rk)hTb|!5L*3;^c1-wMPA{{pdyU0(UaY=_miVqM6{L>?oTS<>2h`4Nue7g)b_A_V z`c9iovwL}HB=n%(N-nrRAOp){bSUOXk2!Pgy}Maf6MRXQn*YdGJ?f|RliW#lK6Geg zSQSNPZg-)eOti}(g&e!s+v8xdD6`-ROM0c3RA zIgR~!es^L2zGU6%Vimc3MQxhq}Nxq7^xn}Bq$*D_{n!? zPGDO42NRMQAX;U=ADSTx<|O``aXQ|mWa0qY0{=j+;XjZg_$Scx-;e{o7R+e>H{^hA zu(FudYl#E`!K17003#XDcNLHI(Yi18OvXvDO5u;|c2KNF-$JX=)mowaE<4LJT2NIv z5c%*GSDi5YyTJIM_Rkp_(*>3>XH{_28xgWpY{)7nhR4Dwq?(|O;Dbk>Zyl$#KgEW5 zdPS$rcBGS~cm+8Q!g@PjJZ2g(QqcuRmy51a;S`lX+HBI6UI$sOiAPgxRI+&q3-kUe z54@tSBQ8Ij2)?m8_23b6D3&?o80b!+gvnrX{-NN7y>NVPhLZ8iH@yl2@eI`@AhQPX z#6FOWJp*h4V54CK-0PbwZkj&;0&hd}M@9o0do_2=dEuZMu z5{r&>wpA7WO?&A9r!Isq*UY%YIqDj2jvaD&b!Yk+7Imd4|5R47De;8xb4TVz zUaWrv92XTxfUWlEB2yq!@oDVF?5*uxOSx_1L)jjjghqM>BVG7&t?mz%F%2^0==f9L zGEW}JgulTGN9c@gz$kD4aVUpch`RDy9h24gXqy0#`a3el0lg=eqp`s95Q4J}^^h)H1}spkv!b&#}K3C!1WixeNmc3I&9#2UyCM(;Ab-vIm| zfGunu$miuo%h>*M)1OHgs&1r3KAsK`8HcG*7oKhejJDn2GdJS>;7?RBh!X3CaJ|-~ zOO4m!U~=Baz7Y_VRbb*8<$EJ&ng=ghiWJ%t`b^j_wPzWw22dUiSx}CV#jVP!-n6>; z^F%^)!i_Gp!QUrQIkZ1J&01WT$yP_K+Zs7bu?zU*^wR@AU9C|eIY-Lf&}i5bf7y30hra49&HQs-j?5bn)!V&*4$G(o~JJp zuXhAJ=ncClJ#G|0F8j7S3LarhSQEO*Wa2Yl>WZLH3fzzKHECN|;r6E19jnY$o4R1% z1Iqz=+tJdnO_FKaW`cRTvg-gwj#mKj-khB#WTS#c4L^1BM_?M&0?k=n7}*H#fTRvm zK-((X-z*!p26f$>{Zh27R#F+BgpRf<@zw>9>yK4a1&d+UV3wlxsS)OzL>frJc;?kn zt3YwQVmnL{ecSyO@d~`3ucr9gz)YToaI0C5u)teIFkf31S&X$VU7d9kKGZgEk140S zBF(o+uI)Fb{tp8X1ansY&5Hl8Xqof;A5i>Lw9I+^Z%}~93ub)%8x%n6I@$@Mgu-X4 zkyiNZ9%`n$Vsjek{wuO2HNUsKpn)c&X*U#L0vdyp@mGoG?EM8QI4>J`yL_8zj_&+%1y5#=(Sx(sw$!~*qWva-e0lV3qUlOuBk;E%SY+e^g2(B4vDa4wYL)xxius;N_RX$iot^#Mg|C9lns;!#ed0+qMls?}CGFuE1Vdkia33=r_T=*$&7^J-?=>%& zQtOst0-pXsg0-8cbi8gKa~x)V)maeFwtjKh4Mqsuji$~RJMbG6U+y=x9aHW@Lm9|* zm0q;q_z0_FBcr=`srwjs-Z@Mz+y);P-d$2^29H`!2Fmnzj&)q1@0SDM=_<;AXZo(ox_clV zswG0M2Kyz?9_F7y`PstpP@kv}snvdr2~2_si_!@>nWS#}7~LtHe@g>A#*sS*HM;zN zR(~3TX^wrcs<%*Wm+lXqKqBsSV=6|ZimQmktWXRGwEl5?sg~8;;+LZ(`2ausyYozl z3R@J8nSqeK>l~O;vM6~MU+G`^0_r@MP+4Z3lPj?I zx7`_5dD@~RwGIrm5uDI)d6(HIjqXCLD^3-l7N^IT_|_{g;CNJ~Sj3e0Um(iH`F}x_ z&HWz`5&S&#(@w$r-yi~25KJfhH;6#e5f%&jq!QJ&km!(n1xN_%&>{5C_$3~?5W>}Z z-?^e~(msRseI~#sf%M$Oq;hD$71U@NEEtb)uw4H8&!k{F_FS-_vv{u`-?Ms*r0x-RUcR-|tryVxv0cg8 z&{&d=fBpO$Am;Qm2O@EmSy;$8_=M0(7ysh%MgZhM_=vc?GJB5{lwAUJwey-uspT$- zzvDOz6puEjB?5Dm1XKJp2B=hKi-dn9EI+}!Z_kI4&rY8PF*P0y?=MvkefMyCaY3H? 
zz*sYpfUrhpPTrj5SScVqS%%+dN?GAgWIf+Lp|pos30_zSgr3<~E4>|eomg(mC!33O zFNJn$2RC>hq&?Q9`0o{XTSbv16s&PDGVpd#L5{dt8#9t^QF_2@zP&3$obJKSF~ZGD zAY)$Mhm~kVupY29ZNJWdq9=RpO^OUSA;95K1IVTUVn^!n?szNdjW{8%D|edp|HT<^L4%RlS1=>zN5lMb z6tXq|12(UlI$a8?OD9%~PQnjjZ~FC18KV}A-!b6~B(}Ql zGJsAQlqAHzBYm{Wg=2P&4M^mT7MG*Wv7Q@P-(^ac;k*|&@_wjp<$Wz^`H~!0ImlXy zp3$Ga6ofA7bCu6FCIE}Sk$JmBG9F{WzX}B`gQzd1sep}!F7&+f?;yY_ve?RocGlx}JH$g&T)Op{JBxQ$H&Y$8S$$9hZh zzh6-U)Gl4Ff!|zFn0i`_Xr$ReqTa~a-lHImo~nGv%H!9OK$}69MhK@5@9h_ZtEBpX z3mg-gaj3*BKv`ldzsoTZ&YoYWcd7_OZNf`K47!M8z9G<&d4C1E4yeR(%n1=)X|}@W z@?Lxa8eCgnN;Trv4+qf@Unxs@DQU3DQy8a=U5fL}`|sj1*jfYau+WDEDWbp5FQ_Zo zHnY(igK>tm-Su{z(W@svOnkFeEP9f~AqEXMtb1mL+wBdz@ht$+hfHk^V*yUX@ao66 z;ePD!a^5g-^?w#6uriO(9*JL1(~@)otc+ z6;0LxmTR{BxB>BOG^|GW5-t^eQP1&$R=H~u$xM=LOf^Ii2= zjD+q-EHSO27~pVCyjJfHaY%H{q_KUo#1!f~5OMDS4u%CX!>vLTlQf+L&^eD!KBeKP zv4Q}`CfaGfkKt+aDqwb$!;@2)RGej;KptZb+Wc#>M!tW2)net1W4+NP8-X_DAJ1!0 zWsn?@O?gN2&wl=Xm9`bfjzyscVf2$!1M*03AgZC3rR-x?#uu2xaL zcw(OC_0)%yx{~~6g27nQp2dp?<49TGq!<*$UJ&!PhNO}g=JYP7HnA9gJV)eDLRpaE zU0w_)7=)hqsmfd!5Pu}$Dm(s*YV6bJ?!)Gr28*G(5QjPL3qz-O$}=%%EIeJl9#>Al zDq|L#D8U-M;Z-zk>PEVcBW`Ef;0%vcWM8do-XN_jYR0wsut&`{6{(P!wg}HEUn70Q zzIembPW~b)&Fy-4yRNZuRdM!6AZWIFgN5I4m7}nFpq+srfpFGm8@#*G?Cj6sz7^!l zts4pcKcpa>Y;kqhrQt*0rf0>h{NBltt&#dKo##W=sQoKb%fYD1zTSNRBchkNqH{rM zXsE;QGQn{znHlVEh>pzTlAay-UuFwB6ofbvO)x0na#|7{T#ZHRvDu1pUfOhF5FGG~ z;0KgrAQ`hj=|Lk9Iv=QPv3gg>Go_1S4%>>#-mgyA74tR4?iu3)2!h=s$Nj(R!_X&+ z7Q!nwIdjEl^4_l{PTadr5xgV1qa?f(RUS8;=zkU*uVzw&?e_KQWCUA9z3au>K5B@k z={WEYOWOAteC1KGk+>(@zVX69;H8r)B2A%u%xI`C`e(imP!@wE4|{p$w1q$gt&b|^ zFkWy{`fu>6^=dchs<+%&A1)=?c@Uy4jD}xXtW`VS?o2=0Vy5pchul3b&b3wwmM^$) z@O8jJsUG8gYV!J`vda+z@fe5)lK!qzob=S;0l#h%xwt4){FUUvmImT9(Hk&L+=UTk z8X8L3%deSq1{$*`KbVp_p<+U5nW7b%d%fN!9w*b)82WWRs^EPi_IDS9?Y&`7qIWQ8yH-bi53i%wun=(Tju;)wlh zptxzHcMCk*#ml0Jq~YC8zvV=+o;A)F!EvK~ohAxXXc`V~V2<*(_n}*2Eg=OTY`DLV z@5q*EPxPWlgp{2F$yHCz{MF5u2lB+jhjqpn{%}&&aTO@fpao;p8_F#s{5QcpXra8r zr&>PVyFEs1H80u2Ue&~h*t$)$FQ&6%4Q4*u+y6!waDFmTCbYb>p~}-_z2eO`pl?EZ zb(1J?A#tveZAb)NT2;~hsu6~9COF_Tn`&~LK3b5A9=%GDWe-jS4RpdbNz{ETTd6&z zNjj*Sx%PECSaBchY#&ciO-X`DIZH+$Nsa@pCzM|C&=V?bogx{rRtk;*u-}@>Vxegd z=yD{U0w;VREM3X%11JdNxdRM@rt}CTj080qO>X6)g@djNK{H_|HCfHO4Mqlh)A8AX zd%UTyufmPST1w?TR!pq|a;s?dYwydyHi)`Id4$&rc@iZ+)lJ8rsd{ocRq5UQZ{v)G zO_i)n++@PWE4!<&^JXBsLT!2N-i+N&k4M87=jxov z&&*Mz5zY0Y*ywKh1OKwj)U{2vah^i;+p;g>yDN1_tJbNR$mzmpD+LJW!YvAUX8GjW zpb36_@r@9hHf0sx%P6f<>hdu%>_Z09O|{9I4s&KDJSg9XgFO(Y;Wfv4w}1bUs3ibx^wnkPjXtJ2swUNnY7cLt;xlxFn!Yv%oQE7iEh4r6 zWBac2b^}$9x40N|JF-<621J+*Fl{-Ni`cArDqIJQ;N>Alb7AWS*}n!Br~q^rY|MFy zu#AfcuvnA_UAi`BlWj!YEFb(JgL+Ajv;_Egd^IivdaE){GRgRl!@^PcYvax`#lf*RY7Gpvgo+DtAaa0_>K5+S<2D8D0s5!HXht&v|rkkD;7^4G2 zPHGkFwqHL3yRGHZsYP&^03l2ceM)cPf;^}e#`M|L-43MZ;n`P?5=@)=v~n?=>#97W z3~%ahy%syf0~$41K3`>q1Th#qI{?9r$MM^VUmz9%1F}|~fe7Ccm!1BaUaW0tgEHva8Gbx3O2kk@Iu>K- zm9(L&pExwrC!8-$ObYE7ifqGXG0NNaKCXDKSK+!zLbp)lc%K}Vrrn2BK31eR)xQ_* ztY8b1)#$&K^i@yxlR#T30t1~f&ZEzH{9d6?a%yk`gQ4d{0wbqod6;}5Jf0dYtfDU~ zy^3pi;tToexAJ?c)HBJ_eo5=bTl5GO=bk_NDd@~06~meQwB?R((&xA&7dK?`@t^#h zGsb_mNeoo@9RZgPf-fZ;;rJxqCFrnu-wv@Aoj4Y znA;KTTt{A=s=5jB#|dBX_=c!;R(crEwoFs9hlgdjE0U^7AqN+tS%V|+6Ck^E1v@jYO8k`rDpOAz}S%%eQml@yjd9rNXW^{{4NO3g2WhVJcSbr_js><3Ce9QWQ zW<4erewF@h+!={41si0s13!EM^=?pboU|b%v94*-+0wc!)>BI^4TE8qtQOo`px10@ z!&cDrj<4RKjQbb@nE725V&fh}Jlu!@0eO}v7h*8lhE%}xp(2P^R31}FxaaEk-{nIQ z$6x36-W1db00zc=Fzg-&3X1$(!mLhBxye8x3j-lTf7UDW&}o_a6u6tL zjhk+>xiS^))F>Sk)doTtteQz64Ow|U)w35k_P#5ZYtcHPr57~vqtWSxayAnkb#_TW zuSzFa)3MWRHQrTMZ9MfE%b4i?=vP27Z@S$SCk0p`=hkJ3_uh12d%>eC%_Ow?(}7)X 
zcB?Em!$@n35W4H=YuN`;I=h!vo;QoOFoFwtv(MW>KCt|k*@+v`J!{B*JjbOLHQH9*=?UG0!BG zn^4vC2FB}9hhMt&eO32w46Smx2&krniqLW{0#n(9nKIBr4~A`e@BG(H&MNz znKFG|_|u^bqe3p*Il)pP zc(6f5c*9z#OcaM-$>q*pM~-!Fp0;ql^vEO7tas9t33@03^a;Z2Kg63W!?$Mw&HllD z)Xh6xu0u>(K-zQyLsUL#=LpD*MtuonN<_Em;VYD~S;KF(LGr}nFk0e1Lx_CW=(Q~` z{TeF#uSAcxjCqIlGSF|SIJHFc28fN8-2y}bnc-b|$LaW5yaZDFgS-XN@LR%=j#j2z z2siMeoU#b|12BR_7Bx6l0v09EyZgg9#XHGUgLwI?Q}W2iA{w zM1bB@wMpR1H-B}W3{MQDS!{Zx_B(;PRcn4y!!PBNN7`k#(;-kl0c(*-P(ojqDRdVeA8L;ghf|ednnUGQTC- zF$%Wu6EAXW&;kY)Um6#8A=}RvBwkf~^@h`d?hwP;4atkgx5{D;6vvy85>2lxZc0l@ zeP=Y%NVd3*M9V3ulIR$el?^M7H2?>e?g@09l!Ws#$rCOL2YHmC}VUgg?=1kRky6?6?Y|Z*c^N zzwCBKha*wI7}z|h;+WPt0uc$F^Ig`1ow1b#U@1ues*aKLP>wkY!O~5aI%Zy`O5GfQ z|C|osNT$$Kt;O#gY?kVH?1*@u5!VRWztAm8qw=%WqhV9k{F0Jg4#V-c_Pr#I)FlYR z#5@X}{6Kf@mnnyztOg;ExG^~+V`gefD1LvYmUe9=v{7bX!#=qx+v66Acz8^+cgfE% zcwdeR5Vtw0Wp10H?N`YmNInEIQga`>7H&7X0L`-6ZLIa5Qnt0+eaC%i%^;$ieUKxi zaF41i3PbYf+n8EmOf&)NZfHuy7Bo4(e^hH~` zAyx++nip0%Vi_!|h32Vw;g0roS1B1mR?5q@=SiLb;c6JW?T4Dd*f)Zzp1loaSIwNJ zQJ9XWE8v8xi%pCL5gQ_!$_~(0vJ08Eaj?&6MM0p*6!Sq)Rmd66u|iMLOx!!3wie5= zv->_vJiYjRN&Tu;i6Q|<#)v#-_C|Y&)`?UN8%~rY|DrIdC+c>=6y&9oil|va)Bb7> zxgJrpMYq?V)@?rq5B#OL5Moi+hU-s)B$D7*(AQ z3SGxaOTjx^`>?(hKrE1(H5<9@03^>^uYk~zvHz}_C%Vy~yC7g}h2T_yGD$yHv*e#^ zaS1JsL@)22)?1f8PW#Y@3wr~ch!SYT;9_wR2Q`9d=+d!;jw`>v2YJDE~`j*e;53@To*1loOII_08{`ww?xh7zH?BKepajSq~PPAKAP#hG=bBIdj!ekbY?sq zWlg?XoBcF=fYsVk?vB3^!GE%bI=j8E;a+o4lk;2`2B@5OhtFVwA2Yf0RB>e|rt`rt zui0k1852)slJ>gSC9xD;SWE_IrCN%%X=XVp;vBPzC0r%`Zjle5P4Swwhx8aZ+He5{ z^k3nOb;BY!p~+eciYicG9z0&CcI! zZh$JlV`@%32%?f3oD$^V=VI=aWx#qJJU@^nCRCz%&%PfbjDM)Z-+#|)E#0Cgbgf&0FCxR74YZ6_Rr?3OppA-2@;Eg1uQi{b%^83CpcXK`X(;>g^ z{Acug|KKN? z!U*CxeiQW1*dTt4MHbahD|XkA$wB|R&iB62W@gah>92=(*;*VbSa-8l4%?|3zS8}bKGeShMQkzLQBT@*WX55 zD+m57rv<~Zq{CgG#uwOGh7@B~Q@ zn*gKYq2{BVd8yy+@(-aG>aE-P>5X+K<|NY2kKM8&$^-fjUfrNo51AI7g7eS|>VV5% zU;O@ojQsx(WNiL7WbFO}nID@7>@Jw$^l!*?ddZtYTbt5fZlU{Oz`{b{zvouxmKzNr zsiT6R@ig9a8LBQ^CLUIlc(X8ZnKsL7l2mTu+=!gN@0Yuo>xv4sV;QADPYlltzn1f- z)h!$f8k!caZ{PBACJmBAliM!EJx}Bi+tBm>rVi(A2n96L-5n@(6#3Wz01P{DVxin0 z-jv)3^ghC-VX!^`rOOy6H3Bp5SZd7Q;wG(SM|RhGcr!PIflvu67~GJI(E@hrcKYQH z;xw#~WM(^Ds^Gm#>Z(J!TtI@|0YBt54 zs@YJS*ecgZeiAnmjJ!9iOtiUHCH#=T7}S5NNPI%jr9;q((#^0<{QeKT^~iW@!5y*< z+=6fxS+Krj*8gGa9iubdx^2Jm+qP}nwr$(ClZtIsY}={i0FZnCAmP>b>>vov^xe%^{RXF~v ztrJSKf(>%kym8b@H#S=Gruu5WSAm!*BZG6x@gP0!>fi;_$3YdxHzIMp7g;aW`W=#p33GX zwiB77_4WEUEVZ!XuHIznaz)tYyXkKfPe2?7L@{&5yNM~uS8l!Dn!((ymXqcnMv51w z`=gW2#$oVqeRnyQyWs^h&R*s~`AX9TjK@#OiPAA}+tpEL*X*$5|wK={NOT#l99c9qsl9$7>CvqIVnH@X}b*#qPzjv z?-k-oRsx=;z7EkVQ#IPp_WfJx^`eOXa5m|Ga(3_M3>Gl2@n6pV*a81JX#SV8KLBNnxE@5y7+K5QF zLDF|A--A}gH!;8XWdqF~+RaB!m{o5S_LJUB4|+UnR7~U->*X#pSD1whQU|}jc_^$s zRFsbWuUztq^~M#tR8>m)9g`E55}{n!8sTtl@rvP}EKZT*&$HP%i3QI*$064?8P`Pe z7$E9Jm^#52Y3{^EB{?fDVOu~0T5ITx=(+BF7e@{A2xY-!skON|6L;9@3Rtn<(Hccl zeAqRr=?5`Zz(TbY^p6a0lzglFc!2WNvA+a#Kw4jhz%k8&M4Sjea#~MM-OZIYA5fpu zbmu6t5vsDqnsfRb6@4OznXA;>M)0K)Cr6;Xe=UMi6+6;Tq6?$jA?7OGm}O2)_4vBZ zFZ|M-FRgB>GzoC!bDgh991*fOSh$g9!<@y~rFe^%-da22@USG&kONEh3Z~1OMFdW- z|I`Uuu~FE{Ve_9c*U}73bzNR4&-$%B2dolQ20^6-J~0LR*+2C_IFUTd2EL7dNZ%9p?w*IoF`ZMG_v+IvD>>(5HkTJ>mDdB zY6FCcE@$zv==#*OUK=z@-xK|p;r;KgfEt@A-Y-r0v<4PBWo=N5a4=4OYA z#ngN{9)8xb|4`wlxM}r&(qs1@Ha!1tdc6KikN-Sr0s>io|C=6wBtTI9Fu`MLx|nIB z%^|lCO=qg7{&o8?!geF=D(5T^-3+a<(?*e&ISh+%d6qrxrTQM`!}gaB6HFkBM6BwD zW@0y}<+Mj5=X4gi3zX64URgrNHy5XLM$3?_zorFV)U075as>>rf3F_;FDI zkg{?-XWtTMJuh62A!zjN`lxrLDaq28nwFq`F~WmXXOU!h*Fq5}^Vc9G@1RjTdB7L> zNV*y57rn0P+*&-?4vV{l)NUez8`mHMBKZKDKL^AntlNLRUgAC`i!5t_y{F|S<7ss9ltynxQIGZBd&LN35}daj 
zVk4XCWURKE!JQ!E#^9Zjb#Sp*3)w&tNTq4l3L{^Z&#|MZ(wa(n7W&?88L}WA>3y3d zkPS3Vc5y?XiXx(V&cb^_j6NN#HXK&AVtaL>0#~x4RV7FI8{4>U_h%9)v3ZY6Sz+#y zexY;kAm0`;xe~6qwXiz1m}1>;>*&;BM)0oIsEtlCQlIfMBpI_e3tK=4U|w)UjrLdT z!uU)Qbjyzg9#6N+@TSdC9=H5C4!QYL$r zHw%{+9G%!cvx+#UK7V3in`?89V%gV~A}Vnv?pVB+cLONp6*q)TpT zp@)FRhQsxdlQnvld^18BLC|a9hg=v2i^$LWHE6{cKO+sU!3tLQ&W9 zEKLi%gf<6N*u;f@U@M{rbeEcDYb?c~`2&j5?3kZ*vb?yW$PCRBq?v|14TvpCT=A|% zdcWH|jY~!1iV^iwI_#~7!!9JVNSGPK6x#^Rcp@_n3^}OwyGzm&Br#uSFKUAeNH{Ls zZ$(=E`WH?etKhr3EJ)8ZxBY!po9N3GQ{T`9oO7S^d6!>mSLRk5ZS30`@`IN87*quf z&Mg|RPSWaM03cl|hPa0_aMlV;wz z^BybcTbuIvg=dC1y*;lHfx!GTy55vC7Lnm(GD4jEA8G`F-%h|PS_x2oRB()kl?-B5 zIqea4`$W%``9N_|fW5k^3%UC>ljz5`Lef-MZ17f3uq_qXy3%nE-Ja9_a8;oAFqRMi z!?ystp!yt+CqG6ybNW76`Lm68%wRsoIi8kaAXDj215r(f$KTCqP;AM>=^jR_b}xeY zbo2E=P(Z94zAM}+3(%lDV@5f_QTmbofUw*8qJi)1@pq(nn zg2G*RHXhG$hv;@u*3*L&y6FSfGk7c|F?;SqR&`o7YvC2Eq`t8ZW;*7DQ~_=8`fD<# zBoO|4ybdn-AZ8(8E7Rki_phE(9=|sz9Fiov@;23VOm^SzdUVR&`nX5g(H<;RElW@z zar|1S^l_&Wtr1bza3T0iCfp{cZ^uzmYo{VsDpE5|!%tdNYAeXo^%vx=nXxQ}`MAc0 z#wx?4wkIsoXlm|vcuhRd>AaCVpkA^}@Z`V}w2fEO$AQjz!f({?>P0m3c@A2E9&?9o}?Ct5xb_v&b3^gr%b>5|8Y% zx>iA2QxsBS;in(rRIRL6*viNY-dM(4_15H9xlHqjHX$767Mvau;=O~eQznZK<}7HZ zNPTavss|hI*Wo-8qnaacrV8x`J^yN1&@DEMn4;8+C$m3xXN48o$? z+!2Oi-D4y7)h+iS&BXrKv0l3F?(v`_LQFP_Y(H89$c@Mop#>g zxvcZiTHI$4Z^Ps)5=x*{8=NIhL&b;-0lyql6rH@LU@L>7Skp^CDUGXfE2>&u7u-kuKt)U1P|Txyk)@Wy6I6ASvvY)}^+8st@% zTk)dfFMu%pi%JJnT{NAU7_g`4A$Ms?4*l0ObdnLHwX}FaN>?z`WJvby)0$Zve6#4` z1wHB>{j2t1wq({i4;ey9r>V z9slfi<;0X-BA*v#G5&`EG29k^2e*Z5Z(YnV0AuEmtBvGUvh?Fa4SvQqw&OPxXR{K; zuce&{aPgYRPTt00(rzd)a!XQ9VmD9SO9A@X6R>fj6L+kJHaqQ`TI!$zQSs2;k!!TZ z@~njZXZSHRb_amINUo(;&s==PF+d5pue0S=NrneJmF=#~f;lk%4kWAEr_;spC_v*_ z;g+R7Gux3uYm6xqnd9?@%dss_NV{GF&AT!jtNrScLC~R8dR@~SAt0bI5;)xr$x(m* z>hf8fJEu*EUcvI6cR1|oTtgCk4q#151WWR-0E*gMCnI<5-|&;y-_!CC)X4-)t+GSsp%Y)_0%0qi*vJnWo6vuu$G?>=YO0GS0OlzG)_`+E z!At@J0;#cMkWKnSb&sWNAUFhgTq}T+JlpaGG#D?TrzEZIp#0FoU_)KUeL}eyMpcqr z5)o8Z_3>u_;>j-nJXE8f6nhd3VM0Ez6Pm@=BD`aH9>_Qwv!@da{A84Hv;p-gA1*0V zA75~5t8)uSZ|#;JJC*8?f2ws*kV_8U?zuGjT;exA&L|;XD2MmascSG^w-3KZg5-PL zk%T$|<5ygCmq9rXwR2iD2CNGOJvQzOkGtW~*JOcbX6>rEnZuDBoefgoBvrS3VHl(A zDa*kB`eNNw0XzY>^NpH6r}p?U_)I>q&=J760Y@JK?Uhz<>A@yoCAtIs$2Rz>;?@11 z&prseK&B8kfz*GdWB@2hugG;Ca$FAL2PvLr$mWGKI}>k?=L%|4En<3hVwQ}e;6~otVl^?QK}l_NNLttJeB#s0S8@-^%;3<6?TwB;r!(D0+& zF@(Zq!!95Y8tm~mpTv3G?U18?7MPdUPvvc8aRNqi6-7)?@e7^~Qrmy5YZ)q%Oz)K* z-rcKvS^L?reev~Sww|>2S{TtttiRwEA}I_vy`<8UeHA)?@w)mq5&&1O?vv6eF&WSg zN!H3Fin-U)i9A>Q4R1?e|C1x_W$)$~-H9d&arJlyxP~@iuRF8Sw=DJUBc>{G0oKl)av{pRMPAt zj*K#mkC9Hld`0WmCQ<=1FQ_*X-u&_r_&n0$^kS>Xg=REaT!Cf=VuGFFtBXWM{j*;z z8({~H(nb~C8)Uge-bAHcGVUOpbc)9p79P7x`}W)Ot5kwx+=Y#JqUdsWr4 zW9^iHnCzh5CduOb@8+{CPX;L%sQQ-Y4l*LZvXSS@*Et@hWITQv+L4NiRz}g4f~_?y zoFlaPlzOe$^G{VvQ=yQBkNjuLumlKjhLH(P78JHiavboNU`pei!1WH38J5czvIV@o z1C`A=INYVf@Pg$`tOPA)zt!$xH>;}7m(V<=CCi8Hz!%rb586sV#ZPY*GRdhlN(qTq&DFp4_y%CKsUr*T*V~TZ`ts#MP=y zp7I2^OLUoDHCixLTll_hHR>VzuW<9?7iekEn0U@Xh6ho2r+dCKAXa?_Y`@TdM>`UW ztG9pqqa)x2at(}=jc>#Lt0$Z2|8XDu|F`=P_^%!z`1zxaKxWvF`=G#>4(f*wppVPI zN8OHjtHv=kVLftB5|-sv#@^z08X3I$7caE1(!W|uLLYS@r;JI0MpA0>UF;rimTnO|1c-N6fodqiY5)+ddJ zm#jG&GK&q?PF@GwhxH_&<2@oY#_-e8M_9a;e50UELX!&aK$t#l>b>6^mWb2I;-KK#|Yn{+#o;y87u z?TRu+;7emPTH12F9w%x6beZ~|o~TUDA05lDhAR+~u6K3h@+l4BuFm{R!s(1$Hm?y< zekAnG{LSJlCWMM1ITIvDeR*MjRgH&%VD%F$pWpMoj8eZ66d5gWLZ&0Sk{wh}B&jFC z*mu1ehux?&YKW}vgx&njYkqPm5%4=!pCC9QF!@G_cn!azJ zJYflZH9`1F?Puug<^;%w6)Jw-AcDlbYs#%d$n8-L@TrS`4+X_~|M^P|PTD4A|iIdfIt=)hA!n6iKc{jif;62;_#wiC(Lae>%VH05k&29oF>u1?;a1)&i$kBMl@6uBBxlt z&xlZ;_Ddxk{aEL^3Ow%t*7^9(&)e_jW&VqC%UY}Zmk>Rok_Dou7wBg%eBjvgPgHGZ 
zCo#5ywD3-`tA~@JPRMCujbQy=o0=IUb*pc2JI^shWhTqp-3mVHud9;%CHYul)Q?t? zvs6csu=xh0|1w=vGx10%DoMym5&ZHqd_<-V9sV2qG)LDr_H<-;gw`YhEvexdhA-II z{hGT15hsB+wmh^SdSA>q(n(LX_JFBz!Og5#_{#?`@3yhN37N6iWfPAWA~cfYo}i&5 zNXnSYE%a<-n;@A|zEtx;U;=FBV9RVuuDY{FY5svVU3ccd8RJ(TS5a8uZ&~)+6b=}> z9@CSp1?baEUXs#?MJG@GgRpH;lh>1623R$2P5a zn=0Ta7sN5zArlX#mKAFiJN=3np`(%m^QMg+jP$m!bbTQ8qI>2PNV^R&+Mo4b$nO=V z04iY5kI?}|k^-ILtURSSm?p&=_0b+iM+Mx?ak^8)3T7e#JdLd-T`rI*MBO)GaL*C} zIdPVSDaX}q5WHS$eLaa*4{Cvl;I=tg9R5~CZ5Q1$@+X~K*wDu~*4iD~08UidzWAtn zuUITfrF4uJ7*W%-*to&MKRQp?jduf}kO5QIQ~TQkXHu!egjO7rYsLI(Pb{-;H@A?F z21n)v5%QI?ZeNOXH$84Lv97+593sLmj%4@L{=90Zr=%9fPU3u7v&iMmad>9|R-yN=i z|FJet?{DWP;cyMJ;=9P@2|xN}XX?7{V73W22?KPK+xAhz6;^Rk#FCMbSA13%eJ}6k z7(q-998?0Ajhxt zxO+h8xZGg304VOol!9b8W8)TLm{%G>)rL6SU8#+AzAn?=Lp48LIfcp%83h za$De_1-FMkh8qo&ir$Re`0=aoYdF=2xPNYZFf~HgYbAhpq+2IGLa;r7Nk*%8(f9!) zj^22BRUblvMTZvE3-vB|Ljo+pesQa$pV{piR3D_7X?IjvMgJCa7M;8!gPq&ABA6jk z=`|vqkcrO}NIzRpSgp0$M>Lo!%$AS&j5d=45>WIy9{OS=9nVqb8L(CphP}QUO06## zme)bL7r~sWmn&5WbWe7>eC&g;Dcm$afN1LzzybuDVyH9LQceB%B9Idg&${YI8TM`= zyt3i>_P%)>_=gau{|S5jckhJ3f8jq5^8doWs6ZCnzxW48?rl+-v|LhXb0q|WBH;>Zs(Uz_tf&`R6?ha)T z^31kExMS^S;ConIPosZr*QhJ{>`8x|yuF7`>i;w;Gl)2u!uEhS-%qmdNO&y6rO}a$ z?B3N~#BWA6xCP5cg>SAL2)9~hitFSvj@o;2N`G}RV*+I7>?-t59N%G05VvfUQ{IeE zbZa{gP+*M$KB5-XieHnQ_1U&u^*}|pX%7!f1a-CQ^T2v_Y|=S3{UusF^M~tF!{O#) zflvaus-{>KQNWfc70%~SB5c+{WV+1kL`iyXU^A1J|p9X2}atiSc1vRq5is2a z2xInOMXqz?Rc}LDCE1mEb!Ez2`$Ktm9V$v{zuP-m!Aus|C*Na1!HHMl8-ZCePPor+ zngkFsJzQ?D)@raAwLLW@nk!*)hkd5mY^Qa$TmV6ZYL-X4mnm!9PpvxTMW-gseU21F zYbA7v2B!oQq=<2#IB+OC`gc+;2Oom6JG5c9oXa0f{QnDXdDQ>kC~Kb7zxo&dAN11hmpZaYYNVYDbR8UvlLv8bVE{kI zLypYMoO+e~cDNJ2U4S|SIkw+k%gn{M$)ge9dPk9N_`=k2)_8G}LHYxbeQfbY*^_w? zzOxR%a{UD&kLRLhBsL}I_>G@FwAf7L@r8$J4N`KXCr7fLs0@-@O;rEp5>F$$V%0$^ z>*TM_mfo9M{5Qy94KeNFE)Hb8x2Cl5MCnk6)b4M=w?<7o_t$hem<5R}RnTTUwTuI< zq1)t+)R@Uu_8XiOkDx2r>)2QKqkm3V+1_VlU9l)(Y}1LVVnM1lFXw=f>M!g3l>)k0 z26kDU(S!4iauFB;B1T8ANS5XMU(O*&eG}a0`8~iv=@{wjA56~$-Uvv|2$uaeWRZGC zKyfr_F#DS<`MJ>ri`B)+rQcH4*1QuaH1PZDEw5CC%WITeHvUs5kfHNVWS^y~IA(Au zPBc`_zninWat>MIX$Gpg?@zV>*p3o?IAUg<}_Daf49Dh1G0V+x!!>1*pOq+iTSEp+{L#rmv?S|x6}nFVoQIv5OT@= zZrG3nsNYH=>%R+4h5E?t(VDHJt^T8G!-Xq?HOZi@H*=0749qvuh$4dF9$LaLCfYs7 z@G{Z@Z_Jwo);^o_Sf@h!$Lju#yAL&K4;s9Yh%>wdyBRqO#!PLEsv#Al&$aOR{Kx$sYUeab8VwwTtkgh4q-B#pmQ*Nwwj&pSF`73 z%D~a1H}A=4T(aE4>K2hVdiSp@8-`k#bTALU<+O1f`gO^S=$GE7)*J?bXJ(*jqj$uI zaBAkdg*ZC70h@j*ZUU!$#}UmJ3Uu#OR_(a4Qxp+oA&zlk5Uz#OJ9>=_@;zf>U#~M6BkbPJm3K)w>i@-supm4RX zRl?MWi#ag*TFRH9GPI;=2N`IK5G(In+S6djmTlkb^U$@C(Iy)J(Xue+1tAKLx(~k9-&Z z6*&Gs2h0DLz~Kb4*8UMVl;;m@XdxQIINQ_}5#_U#U? 
zKI-8Xv^BUS`qB(B0)fWJW8h#HtC=Ve?Ho%LLhQI}%WFOH9}z!_$<_k@l-#+@@LMFV ziVDv+?MH^fM1rD?L07ze&vG(Yhe>~^!ZBfaY>In3vIv(Ac_ig81_c)OPsv12V%%Jv z3lr4?AacTQMl_b_-4JOwHBwu9zD!8We@^y3yEugUdl}Tug~8(}Nw?zBO44^pVm?R8 zDk@;vI4F^-jc^Nog^bnQ0-4PuVi(w#jjS^t5J8Z~Q80X|`BOW%5XEirXt^^0ckY+D zW1Ye6q@w2?<^f!lz-R+j{BuPw|6b95?o3A@>+PRF8S2jsVd2e?D)<^Q4>8zqwF5kfl1dS#Uxq734yDpiBmaLN9GudC36PGW_kV&l%U z+%9C#`I$uB@rHE$#*%S1g0p-K0FE5@+u;v&)Io=OxVjKWnV7e7WaS$UpLru0y*cCc zXJU9167*2SBPQ3CYV|F(&3z5B)O!RF1ZAdBXT~FpR&3LsH?T|@Sb1NO`8(U(G*Frq zXo?cAPhVSsd0S@R$c#c_?+F?NjiiIul#iL}_9)ul-pqAcfN-%yd=%3PM%cR?XR;6; z8$8#VKt#U_0XL&L#zoir`IdjCqZSwm>*2^XKqtIVl10gF4PgtmYp(4PE27dM+qvrD z9aFeZ|Dn_MTdv)UbiPzQX`@3I7(oVF)DAM{nhyMROix7|oqEqd0JoP5V?t`Qo~%j; z;&5rjYq6~#HjYeab4c87+wcspsJ)K5EqB8kTTQFp+@ibPAN6#qw zi71~*s^YbmAo-M4znx5_mT@&{-yp4600dmb7Dp(_;$cWZ>@s4jpZIG&R3cO$MT=fg zw2#BiOZA$xE@Nyvh%yzD8pRNfTj`oaiJgMShg{OMPpBR?wjR1CcAwLp;MEK7q*HM$%g^ zm20GKsW;L*i}d*tck@*|QN=?V-N)IqkJdtYW&d(E#Qp69@584~=;e1Z_ynwfLs}Tn zEV1Uel1pSHG50|fZ`nM&T_9Vvn4|ii_#xquLfv=hL}d(^#9l!d!+5h$AaGP?RUdK~%yh{LB*dk`jXmp63R zV7=Z1vFBlV@uJp=*tZm)Mzl7-!c^XSGW8;I#4piYJZ`1=Yugg3!YzZ~ZwR)>+nwO8 z(S{O~P(c3eNIRCk?A!b~!&^hVphi8M*lR_k@ooKT#TT70$b`gXPd@i|Y?kz8LPGa# zv*yPg^0zIv4~@F}l_k@_eLw!Qqh7gj`*v`T3f{3^*Q2$y{l?agr2!2=&8dC{=T4}@ z)|~fkr#qv^!xIzeI7ZpXT7=*c{&h9G>Q(|d;XMj>l9!eZ?Iz7fXf^4Jxy!%i^>RiAz7P`K zoI2H4Hvo7(PTmI+lNRFS}$ToFN^dJ<5-E4 z%!yC)XhF(e+d=D8u2&@3MfH~{Y`?V?wL_e!t%g5pDQQ$0^(8=ILxzZEnTY_ z`lc3au_+%`^P7wmL?}|UbZRkBeQU0wu#4F1{ zB5Oj&pyhKU@We&0S!)2EHw+r0KWauQGpoS4OL}&a1AF27@JK$=HQJUc6;FJ4?=K=n zxKLi8=k3*v8(ZeP#-SN?yAe8 z+SM=b7}FIRy%anKPgoDgmHQ4uNJwjrBn$>jDlj66X!-1}+il%KtRM@l7HrSfwlU)i zs;KC;2C80OncqU$rAO#gR@p2KsKwc8BP*E!=}X_-Tn`FXvjQ6sR&;2U)eK#+)-<1` zqIYcEI~760FV&L;35EK9(@2cXs=#ymPgb<*e?lEN2#+PV>!yY$Y&v|#j|O0xS7>Z> z5Cpr87G%$OxAKUJ`bX|yr9(EBJ)`&RMh)yU%!R0)=K`c5vXin>Qu1p+bS zC*GZI3yd;@I)=!K!4_fPH4dv9dzA)nn&y@bi4o%n5{^W2E>VTU6rLs!NBP=3Rt7r= z9%zD%vMIpI2!}D{8`RMK?6=PM&wvQ_?i=pp_EB-_wC6X*wnjK6gf+v|JLA_}d$nk$ zJ`((E_Hj3qf@zRcb6yvm18Lt@-UioQjWF}2#u9mP2(xxX|#9Cal~l`6R`PYo)blF+qwXIN!MpsDy@_^*O~)je`ncKcvDgW_o-6E zw)pGLh+)1ygyyyQwoX8m)B&a)Izt~t08)1d2Qz`4oeKF?Xlc5kQdMuao@8EJb=o^N zb4;_TiYQo(p$J|uq3{wA-;xN1D;hqB0q|?Wc}`ES>CrA?O8-mLv4e=~rTgXUqj0ue zRzQBWas@m1TX$9MHsKuNgrnM;q0t_^F%{V7Wp+U1rGLmhzJvs zpIY9H4}uBrtT6q-mlxbFWWdV4bB9#4%;TQl;u0s?jA1)_*^ytvOj%$w{dcye_~gQ$ zE-08uNi;+02l2xLDm4e53!7TF;iaFv0A~+Eh@vB};nACz@lsLRO1&rlzLdE=Ocy{- z62v0kz6Ztkh?{iw7}2f>*+nGa&@_1rIyzRAzfPz#os<4WZBG>xMu37%oKI`BlLhxa zT7r0BO+2p`x%NZc)@_0~H#rPmhX4zW?mq6yo=)?ant=q{NZl3U#!**FP-TdlY?a(EP6*@$2 zFDpS*b*1~hww%N9>;>3;n$FFQT{N#Lbjcaos z**|&n5S?G^@3uoHY^#>X+^|i1$b)xwBiko~95yCD+3kz-6to#Id=1{{0_XFW1a%Ic z9aPO#cKNQ#u$f|8dtOdKyhlFuiUOwL2-ZU!w}yp!9onM>jt?v%+)aJ>r4edWnn;dI z6`$I3+Od=hRJWUfm>52*lQre@=^BoQ@drQJ5NnDKQ26+G5w(QE&(uyz@-NF!tX6;d1KKj%+X|YDf(APK zZFmD#a@};nypcY2i{1L^Eyle%e8ShiNUxX2eq+DL_gM!gxz~qzqRvi~Um&zo$rfp| zPmPoQTsc1_C;_#stAW&evc4bZhRYB#am`1n3lCcj3pcCJj{Jr38;$1_1f@kG@W-=x zu`$RLe4mc8{HqqYFVKb$UW~psBgLfHipXcF2ZpMcqQSkeXtnd9<#RLSH~P)6scsUo7`S zmB`5R(ba-yJJM@i6S_Yrgg@nrj_aU_Dk9U;@|9<<-_VA9RexQ&DxgP7a<87(*U@8R-sqaZB=nSh3FR;Pv z(lsWF$}9k-i484{6bZH-Nf~d4cMJ z5GlP782cj)d#0}uQ=O~(5-vwTGa5W;S6=OE@?ux!7OeO;it_%9A*qVCHHF-jqw%ht zx^uVl3*7a$kJNhrgD}q?p7mKRlG$_Z^|I}89iDdU`k9>xEz3!D<-EREc20bP_>@4N9QGtic=+9WqW^KIiXO3> zJ{h@Co#?wI(mDYNk&-}5SytHPNhPBvKVAquH-^jnrrACGlZS%CiZGaOW>iQLSOsgT zK-Gv?61<--$%BLf>1F1*6HTLK7w=@kS)5YPZO!F3P@yKy+IcolS5L%MnmseynKIV4;E0miKKnyAC4(!Gtbtr6d*To9g8~hAm=0D7L}pKTy=i%ZW@dc ziPow|>h57zbdX-fuS+j2^O~JPQf(jf;lULVY_!xwL4Pi)Zhm8tje!H@4={Yfsw#@f zDZvK)tw0IA;(Tg+Dv(6xbc`p2J~zD1e=G^hSH{h9!2(bPt9aqE8qflGomGYd6lB1G 
zL61FF(o&y~kl+OP81lq;4KllD?iwDLQu$_;D7PCLA_543v|z@MNB4o1!d8qy41$uR zeooBP$#8Fqya%QUyuZ(9)S>{a?#5ow)2k%>Rxb?pGlOM?2>Hl{wE?jIkD!a{YTKrP zHc8J*cwY(vI!Lnx9G==INMxP);Cgz9yaJkygz-@u;S!AgQd8Tj>e)3QSbnPxd%!0O z#3XMYNB2OH$R!YprXRnjt;Ot2j23*IJP9ht}vwW5_`V(LbNq10AykO$s!Mw;SJc*1hICL1N{nnRaCGz=4JcPWlH&#dM0C+0`@qEf zUN!lUX}tdMYvf>Bcs1Sg6!JmPKpIk?n|f5mL*>Bea}V)0O%ct_TyDRa}=$&DXQeWR=`ieGoAsy97 z9=~-9&7ziSFU@|jA`+ytbw0&Na8{WYB}!L$H6hD zekx_c7#x|OA?^tWby}#7PZgH$U|zN22=uN9pi0h_HlXV~_%^cA(36d4 z*KJp-9rE$id(nCr6qPWQXz4!H^1JanR1}|=eYBreK-6wC`dL21sE-Qkw$Gxe>$tu6 zrjIc65RW-hQM@D9Yeb=&J>z>xoPaBpOPk5de)eu9bw&dBo z{YZRLs9>u!B+0i_Eqh%09!Z3NB!BshC_*BBC`BOHx zs|#q(Q773fo?V^6TD%M`-w_K;=5cWPbV4c8w7njTB_I(#8dwF!n08l?B`>zf&C~)L z+P%QQ)D`ZM$w`m$`C9RLs;fj5o`hE1AGoTg?}^y>NWqx!OgbFRn@}>zTRzoLpLxD? zKAjCa4zkV|FqKNJ+*>Jm#E#nX&($4;dL^%`vhd#9BDltx@M zqAtyK`$gM-v+<3yfX?7@H$QEtl8y7h`j|HPQ3=W?&JGzCQCNT;aOnLKPEU^~gmqW4 zm*6IF*r`w??kTvzpJ`!&rk6`(Ot$(iK#y#@nDn!`@L&NS_sEC{E4C*>X|1cUDHd31 z>1;vP?veP-vk$x5=!K@$Zs{+3#G;Ie9@HSBBzT5)jA}6@Tofp+452uRs?vGi3q~;^ z3q>R(XNL^fAApBe7(Q86sr6@*@Pv#K>3BGlRZ}T-Y7)H-oR?#d)7NUdMhFxfXOJoq z?HjvSa}kGdE6Hinu8O^UjID31Gfy4_X(Ya@xUhgytG&lHHOy?fWUEy@ufFijLLADM zmNVWV4hG$P2%fR;_F_0*MTTu}-*7!09a>1t?HlHt{<58SeTRDM1yo#Y?n(z>zm z#WkuWig#ICmh4I|LP9y+>An0LbuR2U!fpp%f!R8}>eeB2=-=`) z=B6D4jpg#Tmy+ng1xhc$gn#ekuMMdVQ7lGjR3JXPS(T`a<{Xy)USGk~tM`mTWNUnA zr0C5U;we{cDy23v!{kQtVV=+RkPD&tSx&FzdjaPZ9R?DxF_ZfGprE%R$m2j(BBj9_ z^!$o%XkAie-jT#H=K(Abl`r&g)Cg&(U|#$ zsB6mKj=c>HxhT2pZ&M!I}+s>`$FkM>KJc%R>d3v8IZFGrC;_!g$lx={1wvXiVg2mr3 zH5b+S`#N}`lzl7R*h;ilb6er%V^w!9nDZtXM$;aJln2jww6_P@#P##f!OC5K-{IWu zf01qGeV3}$d!Z3Zx1XowgJueCKpC2%W;CJu)QppjA5ug|%Bz;-0e(9MS+o0=sUD6h z#^0wwcNb@oiFKzh{GAHdqzq1yy5nr(R&j~c$myV(eGcSK2NI+_zT+`s0CcHroe?lL zDD>Q4>wN+tjK}*zz@QhpeN}neOBlk^nIDkWBwy-v>$yAd6pZR(y}_RG4W+(KOx@vl zFm?n@!pCGVA456qSqVjNF7IbY84FqzK?gz`x#<@SWu(wdxC^oURn#`ZTOs+J$m${&p7+nhsLaA}O>d>P}EN%@fYw#4YWM?xamuW$s*}oO@ zJA8cc58$7;ldiptEX;XN92f~mi^yunGwj?E8U-RGeqQ?(9Vu9i;wQvzQLg4)Li%n>!u?wg!UpL z2kam#3SGd{C>*A}RG2t)y-qB_X0hotI%GM$gjeQ+VzAJ=WQSrxxH7e&T+GzQTJ6_Q z=1W%zVKKsY?iQn2qm$wmIB;i|hm%J^)f8-F@Y4|e+EzKD>kqQ^ ziF|CfLUF@WHhY}wXjmm4op3XC+B@wMlSMx$ctW=!#u|OyoF><%+zV&N1E7!wA^W60 z2io1-&!ixe{G5WuXX)P=5>Irw4N<`Cxu)fw#+tTI&WbVyG-|66Q16e#E7S4Qxqw&f zZ~ZgDU3|#1VLVleu9VxGUF=nCf34W;A0``5 zuXH{p2V>GOmtZ5;aTS{-QD-GP)+ox8V^(lB+KpatCwnRN?&v#U2M^ zu%<3PAhXVH5PjPu{-keKOvI42mbchQeQl5Rh%v@-q}gx?-O`Y>{vQB0K*+z|BF8S~ z#E!&&y8fT~L1L_*i|4iDQO=c`-7C9_czQZU>ra@<2Mo4D&za!8RmhwlavG??O~WMt zlWXr=VNXQ?B7;|EF(>XrLYu`}cA#+Fy6G20M~h@nVvn$7>?sdg>Bc>jo_!LN&OJnD zAj+pVLkfm%D%LLk$A4tq&!V5b~cotJiw-G0+4 zItp|iVpug%{GYJ4r<0XZdSVd#yh1r%HxrV*vKz%8+4zD&z6IJ%rIL7lSP0Mx4pBro zV-wQ|Se45(wK8&O%NQm>HK94mqZ?8LBdiEMdGacERNe$W_tuJVZIwNwQFU)91Jn&9 zW(|@=4AM!xXg>HP^E}My6pL89zN2nQg-#Qqw|+R3*+_sl2^~TyVEI?draPh(KT_Nu z-i9AmH&ndVf|3F)KU4~-^#-8lfX6TA*pdr|t-E&eR>?Wn*1C-VGvJ_EMa?p>%zDrQ zO64Z~e*zn>l;76a7XUioJt{9eVsCe{eg8#yJn23E2<<-_Lt_$$DN9xicR87sfux_O z+07Lo!_Td|lsW5)?DxLQ<@Hb_i{fH_mp#0oyntQY#9Eman*ks|^to(H z+pW@teBwD_zL|w6_$GGw5$un*C~sW=i>L=T3B4d~o>E5?RvXIg$edPh~qRy<3KC z5iG;B_D>C(K%$jr!&k-k>JeogG>iF52{a1)mxVSZ7oO;R+P&SX+rZqV?O+(tN!&BO z%R9brgvVe#hd3$Vs5}uA5(TTZh?4&ozc<-a&GHk$a`#an<$@{y6$<;BCqRpQMooF; zg`hcq4sM8F+>Co)4Z<7)7$FbZ7t2x~rJ3Fe0@o@D8fB-Jo%pXi2Na4+6BNdsI+>{< zi8K>R>_lKJJQoP;7fZ11h{HNJ?PLUtN2cs*=SKH$?k>d7am5hAGYY^TOOhZ>7*7i2 zTc+nhR;2FjC6EvKk%yo0b))E%lk?~>8E#R-7#%CnRe7%v9To# zo3FB82kFri+cE@`o6AAqoP}{%eqc<-yF)H=-KgFlzls`Op61F79u;eYzXNMHfusvh zc~MMxjqP>t z_5N$spcqj?V(1oBtH_XOn^TAAbV&`CsOyMbl6D6b3HC^bRqr1JK$vy0Odu$iEJoz; zq_(VsJz6_QL$PbW!}YAV#>&0j{d7-$5@5C@!)`3Y~83^HnS{y5&z2|oD9KEVo% 
z*Vq-MMDpZNLpPJ;?@r;aZ1{VC`FVV!_08GN)6?gqfKS~;O*G9P;!EIA;~=mtbDnj? zl@a9my^n?`b>u=_&ExcT3IyEt?2nV#^spQ30+Nw-MLEWI~_(D#a&Y>Yx<1ySMajJFZ!W)|=74_SJJ>)ac3p2yucN*taK0DgejHbi3SlhyIf^~S_%{aBg_6vz9c zh_G%f?9$NLPizfwMIH}RS{GET75WVd#mk046DXDgEnT2+5@vx63Di#@0OCB@V+A(E`UryE5ab4Holr zX8QQgW!TLf24MR=R4z8x6I2Ig{BqmK+Y2^M{g-Yv_6Pib)w5J!(6hi_6+qhrSuv5% zb2RQjo8iC+^!Mdf!A*AtKbhdo|#4rqI7Y<*dV69jLB!6uFG>lqseLqI{sj1{;B*Z36K~Pvm`uka-bizz*aalU8l# z79?3OtpiSUFhMErVXk)wDE4rNa#EUGFS=8A`P=3HUeoGt1H8exakw@EvmCEwS@2nv zw9r69KD0n1R><2@qg#+7SDEFxSL%gE9t?{PJbJozGckSr8uwm-&qoExK`vPqx|TH^9FAr|GRwO{s8Sa7e?d;WhKT% zeCh_JM5Ldq!b^!fN6#Z(^#%|T#@X>s#hNvb+tGVvf|y6g=0j{x2R-3~*JewPI@p24 z{*Rw_g`fsBD`Wo`STu$YypMZCdm6ysd*2~(j6Cb2g#&y)KJQ*3fS$JI31%nWKV|c| zfP>mLI39-WKa>RUQ5on|TB+}c{%Oj8HmxAUJ-#2+T2VF+q{b(5+Y!_GA7@EfDcB%1 zb3$1I>@kxZJTc7T8~LM{1qY(lQ5vHMFnjvHUj8ht-STKI6 zM-{)`)GGe3@mB2cj(B4qSZazlg4_Fc=^O5NPP~P=ptpv+^_x-! z7Djy zPMyo8s5)*H^~5oNW!hb(Pc9_=z?hO1J3p#yDXX$sl|hFem4dTSih;Ao5&2RIvC#}s zgsICX#!6W5!7_cAj3Uox@?d(6z!4C7g$b1f$>z_8Q@SfFG4X|Jl+)#Jy}Cg|4prct67CkJzw3h;!QdvQpLQAvPjpRTAKPNp0ci20yCm!DgU-62XV zcy3igIm(Mh=MAe1;j3^oLqd_=-Ks$bv$ijI7kcz#&~xKjz7XZ7yU|`}rN&jBG6#ErOGF{ylD3tot+E8oDWr-hrl52>K`_K$DE1w69%E z7wT|2>`(iFDrJmfHHj#taKnGJ2U=GQS6n&r1&xyF_?@EMHB4@KSo|kZ; zT~`kTxSHlEkN$Jz8|*aZXWK}=l8D~lA5T|PpriO@p_iF93dnREEXQ@5)a8f)FP#o~;|l9{zfVh1`+= z&sZ?vP~O2dMr`;ejvNtwPP_R&_Mmd7Rdu&#k-acJ^7LN;BF)ihyg5Rfo`6}XDc0MySZnlpp={@)A|e5 zdpqk4nSF0cTfFU z2=Pn&YIl&MR8}X!Q6tFx*_3#m2+~1*6;J!i_WrtaRr9{FeBa`GAAskN{VlTJiIo1C zrB(p;bD_Qz9er1z?;uMFiDLe_Loe1z6j7^KAQcJ>LArE7f8W>FIdq~8sdh4+5fwL3 z{A0gm@@;z+zS7x;_abtBcHXjlrIryT*XWjH)gPUqAQ7d~I96NE*whk46D{PLWxiJ| zUEiNIqJHRo4@ccvW7c0u`C^HI_R^bj&xB8jyxvO5LEB_*B2Zp&a}LNkDNV0Qxz$Tq zGyV?>UUOpxe|?ehCGKJ+px2*g1J9y5Ixp;e+;!x^aZEHruH>>aDLba5t-%gFhJ6dY zdZ0IBVdipG!X%w}m<+^&j586!xp9?vg6+%v%P4rIyqP z0;PA{3R>|(Mskc}3FJMa+-Rl7G{>_}NC^gP|NbcAW+4f3W-YpY`w=<5aRnh?ba{2E zCK-|jy$q8{h+u8v7f4KMexQa4aS{T$pS`=lXPMbA0F|T!NNM2$-4!xnnT|LxRh0L< zk>`FbEi4UVzOYU1Xlqf4g4yyzYN6#ZU=uL}9x!yEicz37#K=mA_OBiry_a-mc+w)5 z>!>rn@-+%5r&)<*^jMOIT%W(~p)41w5D)L0}qm^{Nve~&RjXa!J3FQi5Sr}7m!=to9XnU7j zejLqFm0ea_=&m6R54&D#l+iu({9vpoXtftd)99r^7y@o1|CVSX8^W_xRW)~dxhwVj zV~>3KS1Y2E5j+{aWVl-N)8`VPOs~rQ1ziC(T{e_K|s}(IdH5vSB(Kqy0p@e$K zvw2E!ZT3mM;nL zog`Y+C81KgkyZN?vkMNt-^w^LmjPT$4d|ZqlZS(pyWW8k4hTY(3Iv(vTnpnlwno`p zcEh~7#6?%|CbYBUXNe62v_7k}C<{9~cYoXJ*QQs>F?4?y=dIkN?lS+u=^pt-;4;Vj zAIAK6(CKAz@jG%w%0qG#9WlRJP%1%>FM^!%+g&%DU1R;QW84fe6}r4?wc)1*+Y7%} z-#}}4E$TmbLz$k^+oYQy?Yj8B;Q-CW=EGcGHEV1>%>?WNGyE;Rw(B$flf!T*IiKl6 zDw2@nwOEt;x}9T)knbHnfJ;5m0BFRusce+cO%dD$HsSx@T+c65Q1C-zX2oT`a29JaJLCYu`ILqw4SHeNyVvbz6Exs`0wrs$uDa2BDVL{mTL?p|;B|qgQ>0laW zf@H*LFr&5PV0ZlTR-Cpu(pjwsFGPqq1A7^kKTR z=SDsMX$PxnUDl4aUST)1RdiKvgw{2OA)1C8UbW#2j&f>4J@?lj=Od~QWt;~86VWuv zFLFPlc)0084EuaXxj-o3otOp=qRe6wjKr^Lwb5i0=b7s%va(~>jU<3+?bl~-#9m|G zw4SF{A4sjC+-D%qqVYehlKV(`QZp7)#oRuvkayg~`|i-)RU=M?GvWaF2VZ8_irn@GV~BcH+%}H^VM0)S$!x%g=(#OwY=#ZkXoq z?W2y(TK2%O(&}2nmLuhE5;W5nT@QoLgw(~q!icpTir&eGq*nd+Bc!YyN=T9ug;961 z6rfKky9t@e!@6#hQ-#oc9~Rei#m+Z1m;0qD3!-`1e3e~wTr6A7g#kV-tmnqjltj@$ zbrPX>ZhQT)dem(#{hEcUn|jCmXq$mog9eH96nmxMY@)>;dzWX`h^hYfO$-;jhCx;n zdC^e3#3Q-1fQAl`W_;PKQ_uu`;&5@nN$$1}qcW+f!rd=bv~Zq15HiS=_f(|2wHxt? 
zWw5I>8I}m0*HTYG-~3dR-{b{d75n|NFzWq_{CPytM8k`YJT89igB{n$e=jDetLIhw zpPP}c4;mDW_%~N=6-IX*J_Z>Ga_x;o%)?Ap3Na ztSarYZxkRUv@%-raaP6Y)@9_5)cwB_t-AFQC)$iNJzgNGz82j`>QVZnOiw68#1TQ;bHW^!+|ddr0?7w3Pz&&$Gxh$xZm#Um2AA3Arz*!~xGV zk`GPO5P{9lC4+^j$d|Bj{DUt+MI{skfgRNt)VZ0nB>u=ZI!; zq649<8GAG2)L%(fB}61DdZK=_Ze1y-SG)XnvMlYxMr2jtzN&hC_`tysM0HEp8%?B9 zIxaUL3ei$Nt+)^|K85vukBzBg#mQ}zj4avUwF?6k)5kWZ+Yg15?ajIa9&rtpMrJ_ z`{_z?w5ls@-x7`C8l#b&%z@lqS}F#~ugPsjA-T5oYdo3(K>I=eaz{!_Z%1aA@&f%TB7{ zE6M#m55RD+Vh~A00kLKSH|Gx1Ta24(Tt5VEd$cbWBYa^WOqZwB+~^eWAnW^$n7zB$ z{5J%pP8HkN7%-Wb#Zlq-3K3WYc}*G!P%vs2Tbq4^7E*f?MLgCpxkUHO1=C-q+j1@N zl`tOw?!`fitb4mG1*z1#XtJP_!kV6&vz&Zs*O^uTavrY6i`EUc?5C!V=gox0K&vB= z-a3pq4AJCqV?J2WM39L1E)vQ&!l<5|aCK^6(T#!Le0zmImPv{ zWncb?W=*!Hjg7;sbg7?okzp8~JeGfhr!Op(&>N6@=niMiR!nQyK_dJJ(yvC>FgE`W zs#Xh!p7r6}DauH??$jggklSB7Y~KgRsyO%9Zn6ZHT7?KAx8$VZ#YEZ*(7OL+DlP`+ z6VNFeCO7bLWnAUtE5Q<#W5wpAdHDb^d%sy?iaKEC(f%*J;lEDM^MKAf+zjPZ!Twi= zx20-$mP+%*vCq*v1)Bf?amV(Z(uev>oCWKNua;Zd!&sIuPnccUM}y+tG*=L>%XNJ% zn85|_uD(xw_){Hhf7Y=J2?Q8@F6=3k|8Lkw3N=(Jh-IhuPF#sH+rbVj`j+~51tY{k zT9UenJ?Few?2?z!cMEq@ZY=MeH5asXV=iUk+|IZh_r=;(A)pYa3EWb*K_4?@MqO+l zLt0>>?)r~X;=(P4SVq%E=*PMtei69_ENVZHdxl>I;0iY{xMMkr)7So)hV`&@_o7RY zKEC4y!L4Bz&@u@ED#A2B!R%KQu{?VyYAm%lCg}YtjF|Yve@^N*V=mvh^~Vwywmq)# zScSa;f9hBzcty?K{zk(sVPfC9*a)mY8SS? z&n~#zUdPC*Koo49Br6Tg`CN~>(V~5YxkpU=LH}dnwST>QLleE9VRrv=TtYV+CI$Gw zu61y1*0026pM;1sTBjIU21g|Y(|hCb^Y+TO5s>P~IXfErDWPBJ65(1iC0;pYpR6V8 z*7Ud~${saEaI7Box!t{X{z_X|u?Z4p`_0WI-9;OqIbuX+t*VH%kQ!6C%>7y0vKdqF z8r_&nzImGG3Xa`q=CefWyNeO@)Wgw#iCja*Q~?*9JZd6J>AdOc)8v#P=eo^v14!oh z!s@$Z*yZ6H;zp^?&CI7NasE}@ZeEN$TkX`-VcY2l&0U!XSB}LXbBn=1-7qIbIWzo4 z0G=>GZ9JXV5kpr}Fc-pTVtu-NAx&K2A+x2F5B{=+mm9#nY7E=6{l4)k$*@F><*l0l z4(-`>Q-aYaay4sB7T(Yo&%3D(Och2+*)+Yz#f zgh4uF4YwBWZF}Et_;g?o$`zmUAf?m^!=qI~Q|JdeJ^0AczDL@uwXs#zE%N7mg+&a~ zcJ;r3*VCGgQ&ktnIR5cs^zgyx*!Bbzda3Qjhjo~wZ^&|j4pQG7l5-5&Sgya*$qrB6 zn|}MNTjzey3G>Tt-pGw60k-7?Tfy;L1+=fJ5*NMq7&`EeuFU%vG!)?afb=Zbcba4( zI)J+c*>e7REp6J)gORZu(IO1vG%O$ZZIMRgTty59FveCX;q5Ev_2k`;c-hwR6V|nY zH~#GjoH8$=t%%>D&$snhFz3)B{}&Jd0`JH6)yLQ?NKD9$C|mNNP^fWdg=?5IHi}Wc zIJuJcas*&S6s|DI!%axSg}bfN#uH80i!~W#teemOWQjo3k9mS2R6_Q+TAcn-#*_$ofdOM032bGMPwF z-BKF2t^+8>4u!}oi(qhc7avBigLiSWMGQVKxcO!z!>VmE$rg;EU>r z%HcWqfA)}n3=mk3=ZmsgSrJkq&$3(sU+Aj+Aupp~tJ0>L;a)0$eAqG?|3RtCp8Vqb zPbpB$lq{62IH=Totzs*47T$uk$|KY%5vgMFNx18j`Bt}NeyO1hSO-n#0xzL&5W)Xr z>Db%m@SrLwA0d+-s%pZGCy*QGm{d_1iXedY*n{+i`MIxNU4yM%*o0#gkv9{g@_OjD z72GWMxF(vNu_Z2AEPm-W=}uwao8CqV4eo=a#gu zD6YTPvCYJli+W3@|4^pbPI^C!uzwM^=gmid(zrD0>6bCB6}Q{W_1VUwW-;&GZ(oO- zs;y&k%>SC$A)PBk{keGap2+v;ke)eBwNR%g>EL^@Mv_?g71a_I!h|-4BRqv2lv2w| zv~}3e`{!gUlm=4-G1)oGSY%$q{7I%jep|Ea=(HllQP9r+=|!T;oF3+!*Ntz2(1+@e zn5Q0=DdXd^HuN6?M`Nbuxb4nl;#;~-77M?W^Y?ZVKX_n=M~j=|Y*cG=*z|udgD}crMcx)I&y9N9P}mh2JBxoE+@(L(Dk>mm#inL#hVO z(sE0rkjTeiDMF1e;SX0Npng;A9PrSCbIWT~kv3*^I#TS*0_p5k=B7KPmo){F?QIE@I%oQ&^ zNuuL${DQMy^N8nLTF_b-Hq6w{&eCJse9=^~aGCD@2||Z*C$vG9YjO|a#)sY9Ne4FR zJM?#yk75B4I2`sAIydFOxNJy+Y=&vTyG-d;==dm0rhHet1a&Z_z`XXHzn zm?yI_HE-aVFlo^$1X7sg_vjkhzts6l?lD@bS^iRbF_vTiUPOWL5u^mk+Cf=i%W~@U z;MEg%W1TqD;C%|%NjiA zO2+;nC9wE_SH}UE852*erzIfFrfTqasMNf__o=;)fkU?o#WnJp*Znf;yj6P`gxEBd zK(HyMm?&FrQpIKQQJ>)rP!gBNk#c)2$QQG7-vqfmVQJVhb<;wQL92R}nVnbccLYCN zkGa|akZV^Y^2Ggvvp0ol$9$WkBddGQD_$lz_}N#}OFvYgnx72PSb0;tA3o6g=F3Wg7+7cl9VI;bBG*SY;DE1YLOQH)X$zaw7D5Js{}UP+`((b3v$p% zlpsouizyIirABZj-8dDlrgU%io%Hi}YDz2TF~z2vRGo3b*`Ql~cW2vcC4P{&?d_AO zX!6uB${ytVZ_%fbU;w#Ap8fknYJ_sGgY236IeJdfL=frX1|DDwN6&i{Qu36`d!P-_ zZY&h(S$fY|@>Pt+Uz(t9D)7h)3_<5fjswq;gUH_*O^5O^<-*wuyZiCzx`Y#;ZZCRo 
z5YAeGm3-?(%|G=&x`0P@A;096WEeDvCD~4;9YLIek#{`eqL_;;Wj9^MVrKD+rL;?u9_H6AuuDV>cI*2!yO}LWQNa9r?m}y<0Wk9r$owTULlw`_wbb z#A#?u%}A;VERxz4x-ox@*D#J$!dCvuS0dF2fh3rAO#F(E$Q&9i{z50(89D3S+0x_okC~mu;%epfmK&+F4-YrD2sIqRujI0c;fWc-u%61L5-yS;Mb!hu|~S3A)Z%I^}(JPN1vN@GfCXaN>}}AW0qMV?^7jc3$7NVcDH>Ko#9a2 z76+QZbDn@;<}UO?ae+9&&^@aHJM{sYl9Uw&U8#IA+g^08V$Y>vO9M!g!lg}_v{nsy zJ-fT7-Bf)ak|4iG1ZXXd3|azGR?@KK%_ow=@)Vef9_BmKaM?vyW){dZ`EIHQ_0!#% zVAV~a>buO( zMMB*-I8_>}%cXkLF-9}#Id9DJr(jYC;ISV?^F)e`09#zPI?gh|*m;`F41x92+#a3x zU-63Npla7>C&A!we5yd&C0lsC)F@y*Cuvyadx%b#L}q>;Us}Yr=;~}Hv`Jw2>#Pn! zI_hPvUUbjxX^wiLJltE?KGL5*Vfi51k@u5SQpJnD&kpD^cw#Yj%-wf##?G5u&b?Gc zqy)Q)PJblrx(mLBV1Q$>@!x2ApG;HGpe43kgVE7cJ9g70{BdJhZo%Pkz22I}L#?n` zEdLWuE=c~fA*`~FaH<&#%%W11vjJi>#eyKr4fvb0#~`u}JwrndrGC=Tm<$L4ai!$P zc`n47g-#4XLYYh!iag*{c^{9K&s7f{g>qbgR4vv4F{;U>^N`uuS?Yq#SkbfJYd2Bh zF*N;VR68Vzw4io!mJ=l8xUss(K5<{Z3O78bIQ`3!&pLm(K_y8FF@$z!0+ZA_G(*69 zO0@wzU=v3<@gTy4s$^MK6z6l{DwhPC&f~6@4?mX@brqS^7J&3m%*R7g8gmkju0l z$EUfU3A4aio!NCzxpuxKg4}QAWeQKm&K!AWy6H9gm6%@{6i)NZ@dqe8;GZ;0d)$zV zMfNB`oE+l9biEouVq3YUqO=dshvi8d*9uy1jNXyqfL>|<(fglX6C%01HUsJsiDSuI zRsGw?CRF4&U993KZh)qjIN4 z3DSw;I;^turQM7(n%QQjE4kPVf zYhxL25etV66884$5khaYT=}Il=Tf%UNm_@5fpu>e(cP5BksH_k3?nBVvi@wjPK}aX z+cIgNw+2DgC-#lOf{_?3J_~Z(Pi!fspxkSp8?FF=KBi#J+e-}tzm^F#YaA=6l;?ss zQ*^BoxXtJQvm31|VcTgd6ne4idM2k3Yoy(&aRohs6Ojd{nOm7#RF*`U;&vKc{LqS; zn;3Jid!Uj1GG?@w}C|lhjF#+mWs?5Adq_F7X zn)%KZczJ{S+|XRi&VGd$cVvH;AeZ+_xV=J=oWD+)-XVmBTZdG+3el3tf$y}P#ta@Y z5>C1#_vdZRlAuGwMwF_S{o~twcPdl#TI;;qf{ru!AHtgaQ!B$H z{Ux3Q3NZJrL{!b$lk!XvNWoA7KS}+B=w5yNTxp>flS~Vk_NRmKa*D^BA z!3e^6uv92|fr#!BaNEgQ4`y(2n~iOagMC{erT$U>=9uPk+9pOeCqH{A?Z4$A9O^zk zY_qvO=8~zxh^eAlEls2jMn^llHqB7KExlr170wlxLrv>SS9qmPdCV82%q0Kp&44Nd zsP#)t4TUUb$zW!q2j%_XLm&z@LS8{B%l3fDdK-JMKStbPdB0qH{6sWx_83d^f;(m_ z{Qznd7P#n6(%S$0s0zQULZCScSC4Fh9|zh1ej$YN5t0F;3=~eI>D;aL(yN(3;aSm;0En?D7Nyc zI1lf=^&Vt3S+2)&-*^ebWs*NBaNpnUFTHqzv?n$`mg$faew``MQ88c9E9o;ynfTa9m7TEy+I&I4&XRwIMN1pK4ht z!;`Y*+6O?H*-4ABX$^csc`AG4r}F?kUoE7@$*^kb*kn@$7>5MXGWkRAL{BcGkw@w2 zO56wEG?44CL4K{%?t-TX>cVlgt=#YtTBr_syn%Z4y~Iy_z&Td+t10S0T8QLWt9m5~ zHgIYKC14hWQM%p%+a|PpDjSQJDkw5R;ddr-+j7gL0L}2QgQNRJ%|IziUDla{Z0 z^lw6&^55zt0u2Os<7P&}yGp8(;?{t+DS>*Cyrt=Mv{VL2MXX7Du66%LzZv7mW;B9X{X-dk9Hma+cw+>90Fj{u}|f zAU+@flx&Ia@2>uZaE&f}WsrXhNSc$gN&=gf2*Qa1~9k;Ao=vm!oK{^5%HBC*%}3Hx(xux(v;O3p(YKcE|j{&zm^em#VMeJH93-1hsF zwKtHVWl@AnV4SCExRvjC&$`V6&2)EtkxPs`jq#lJ2W$BSk^mG0y3n`7S(?K=gr}Ss zKtFa;DgKgC|Nh74OV-dt7a{j8H%w=3&yN}nf4jlD!LsrDlcZx9(u~tc{~8nPd?h2O8xfd{Tc(2$|`$m^K&9s7w#kGe3cU{LLBd2Qr9IRHd;h?b?R7TYqGloOkJw zkM#pS#74K{BlzIp!_7$o08sae|E46pCh~x#MW7KG9Zdd3hQOaQZj>aXMfm+cTu=YQ zmEa$)=l{(WXq!Ld=Kpft1_HIULJ|?d%wYVsn$yOsV5c2*8n}~GjccWMx^8z@qJZP1 zRocZf5~UT>AH@mwik@@tgWc$=}Kw^c2l_KKATXf{Ce7hWNw2N7(# zKF|QCHHTwNg7&BUJRzYwSK}4||G&ZH!u>axoqu!4rTq_>e_ps;`u~9Wmk`6B$@C8x zDEA&9V6fz(5lxVsJc)YBYYGQ7eoPY(BjR<7+cc%j+|UYUd?`;-&mH2-dyNu2TM$KM z>}qK9Mi+UHxcLgM!cxcuP1g_j?IkwF4M0#!;(Ot5b#+ZcK>*m{~wDM{FzGJgpx+%dK9g(e;qST zMi13cZ&viYGCXw6(CnT9&+Gb(6Cjy4c}a48m6nfApNfirN%M%dmb;Ew@k@6C#J>$~ zSFx4hbLHbu{Wt|)v+IxT-Gwa$-aJ#CYre#EjKf}gZs~@>wEo%N@qf1W;?H#BCY(>C z^b__40x}w01zF>vG)82dBIi0pIqu#h4I*lx81OnF<9QB3`Y978rRcR?Q`6X6j?bDN%V(?|SZLTY1?t*r?M*KW3s ziKC*y6gAsKwT~j&s}A_P9F0GB%_vDp>hEU+rhzieZ9$Ij2gH^CPodZUzw-P4ROU2s zFg6x%F}E@%kaMs#ayB$JB5*Z#G9wU`;};}gqGO_Cq;)WMaWJ+r)^{`p F`af|AzE}VN diff --git a/python/pyproject.toml b/python/pyproject.toml index 7ba4b4c6bd..6b1b032fdc 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ ] [project.optional-dependencies] -srt = ["aiohttp", "fastapi", "hf_transfer", 
"huggingface_hub", "interegular", +srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow", "psutil", "pydantic", "python-multipart", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.4", "outlines>=0.0.44"] diff --git a/python/sglang/lang/chat_template.py b/python/sglang/lang/chat_template.py index bfde4bbdb6..92f717127f 100644 --- a/python/sglang/lang/chat_template.py +++ b/python/sglang/lang/chat_template.py @@ -137,7 +137,7 @@ def get_chat_template_by_model_path(model_path): register_chat_template( ChatTemplate( name="chatml-llava", - default_system_prompt="Answer the questions.", + default_system_prompt="You are a helpful assistant.", role_prefix_and_suffix={ "system": ("<|im_start|>system\n", "<|im_end|>\n"), "user": ("<|im_start|>user\n", "<|im_end|>\n"), @@ -145,7 +145,7 @@ def get_chat_template_by_model_path(model_path): }, style=ChatTemplateStyle.PLAIN, stop_str=("<|im_end|>",), - image_token=" \n", + image_token="\n", ) ) @@ -322,12 +322,17 @@ def match_chat_ml(model_path: str): if "tinyllama" in model_path: return get_chat_template("chatml") # Now the suffix for qwen2 chat model is "instruct" - if "qwen" in model_path and ("chat" in model_path or "instruct" in model_path): + if ( + "qwen" in model_path + and ("chat" in model_path or "instruct" in model_path) + and ("llava" not in model_path) + ): return get_chat_template("qwen") if ( "llava-v1.6-34b" in model_path or "llava-v1.6-yi-34b" in model_path or "llava-next-video-34b" in model_path + or "llava-onevision-qwen2" in model_path ): return get_chat_template("chatml-llava") diff --git a/python/sglang/srt/conversation.py b/python/sglang/srt/conversation.py index 5ee1216974..d5ca327703 100644 --- a/python/sglang/srt/conversation.py +++ b/python/sglang/srt/conversation.py @@ -34,6 +34,7 @@ class SeparatorStyle(IntEnum): NO_COLON_TWO = auto() ADD_NEW_LINE_SINGLE = auto() LLAMA2 = auto() + LLAMA3 = auto() CHATGLM = auto() CHATML = auto() CHATINTERN = auto() @@ -137,6 +138,20 @@ def get_prompt(self) -> str: else: ret += role + ":" return ret + elif self.sep_style == SeparatorStyle.LLAMA3: + ret = "<|begin_of_text|>" + if self.system_message: + ret += system_prompt + else: + ret += "" + for i, (role, message) in enumerate(self.messages): + if message: + ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n" + ret += f"{message.strip()}<|eot_id|>" + else: + ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n" + # print(ret) + return ret elif self.sep_style == SeparatorStyle.LLAMA2: seps = [self.sep, self.sep2] if self.system_message: @@ -379,12 +394,23 @@ def generate_chat_conv( conv.append_message(conv.roles[0], message.content) else: real_content = "" + # calculate number of image_url + num_image_url = 0 + for content in message.content: + if content.type == "image_url": + num_image_url += 1 + if num_image_url > 1: + image_token = "" + else: + image_token = "\n" for content in message.content: if content.type == "text": + if num_image_url > 16: + real_content += "\n" # for video real_content += content.text elif content.type == "image_url": # NOTE: Only works for llava - real_content += "\n" + real_content += image_token conv.append_image(content.image_url.url) conv.append_message(conv.roles[0], real_content) elif msg_role == "assistant": @@ -425,6 +451,18 @@ def generate_chat_conv( ) ) +register_conv_template( + Conversation( + name="chatml-llava", + system_template="<|im_start|>system\n{system_message}", + system_message="You are a helpful assistant.", + 
roles=("<|im_start|>user", "<|im_start|>assistant"), + sep_style=SeparatorStyle.CHATML, + sep="<|im_end|>", + stop_str=["<|endoftext|>", "<|im_end|>"], + ) +) + register_conv_template( Conversation( name="vicuna_v1.1", @@ -437,6 +475,17 @@ def generate_chat_conv( ) ) +register_conv_template( + Conversation( + name="llava_llama_3", + system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.", + system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>", + roles=("user", "assistant"), + sep_style=SeparatorStyle.LLAMA3, + sep="", + stop_str=["<|end_of_text|>", "<|eot_id|>"], + ) +) # Reference: https://github.com/InternLM/lmdeploy/blob/387bf54b4f124e72aab30ae9755f562e435d3d01/lmdeploy/model.py#L425-L442 register_conv_template( Conversation( diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 328519cb26..2d604d2879 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -131,11 +131,49 @@ def __init__( self.model_update_lock = asyncio.Lock() self.model_update_result = None - async def get_pixel_values(self, image_data): - aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None) + async def get_pixel_values(self, image_data, aspect_ratio=None): + aspect_ratio = ( + getattr(self.hf_config, "image_aspect_ratio", None) + if aspect_ratio is None + else aspect_ratio + ) grid_pinpoints = ( - self.hf_config.image_grid_pinpoints if aspect_ratio == "anyres" else None + self.hf_config.image_grid_pinpoints + if hasattr(self.hf_config, "image_grid_pinpoints") + and "anyres" in aspect_ratio + else None ) + + if isinstance(image_data, list) and len(image_data) > 0: + pixel_values, image_hash, image_size = [], [], [] + if len(image_data) > 1: + aspect_ratio = "pad" # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. 
We do not use anyres + for img_data in image_data: + pixel_v, image_h, image_s = await self._process_single_image( + img_data, aspect_ratio, grid_pinpoints + ) + pixel_values.append(pixel_v) + image_hash.append(image_h) + image_size.append(image_s) + pixel_values = np.stack(pixel_values, axis=0) + else: + pixel_values, image_hash, image_size = await self._process_single_image( + image_data[0], aspect_ratio, grid_pinpoints + ) + image_hash = [image_hash] + image_size = [image_size] + elif isinstance(image_data, str): + pixel_values, image_hash, image_size = await self._process_single_image( + image_data, aspect_ratio, grid_pinpoints + ) + image_hash = [image_hash] + image_size = [image_size] + else: + pixel_values, image_hash, image_size = None, None, None + + return pixel_values, image_hash, image_size + + async def _process_single_image(self, image_data, aspect_ratio, grid_pinpoints): if self.executor is not None: loop = asyncio.get_event_loop() return await loop.run_in_executor( @@ -194,8 +232,8 @@ async def _handle_single_request( ) if self.is_generation: - pixel_values, image_hash, image_size = await self._get_pixel_values( - obj.image_data if not_use_index else obj.image_data[index] + pixel_values, image_hash, image_size = await self.get_pixel_values( + obj.image_data ) return_logprob = ( obj.return_logprob if not_use_index else obj.return_logprob[index] @@ -704,7 +742,7 @@ def get_pixel_values( tuple(int(x * 255) for x in processor.image_processor.image_mean), ) pixel_values = processor.image_processor(image)["pixel_values"][0] - elif image_aspect_ratio == "anyres": + elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio: pixel_values = process_anyres_image( image, processor.image_processor, image_grid_pinpoints ) diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index 41f9083012..fa79f84921 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -322,11 +322,16 @@ def handle_generate_request( if self.model_runner.is_generation: req.pixel_values = recv_req.pixel_values if req.pixel_values is not None: + image_hash = ( + hash(tuple(recv_req.image_hash)) + if isinstance(recv_req.image_hash, list) + else recv_req.image_hash + ) req.pad_value = [ - (recv_req.image_hash) % self.model_config.vocab_size, - (recv_req.image_hash >> 16) % self.model_config.vocab_size, - (recv_req.image_hash >> 32) % self.model_config.vocab_size, - (recv_req.image_hash >> 64) % self.model_config.vocab_size, + (image_hash) % self.model_config.vocab_size, + (image_hash >> 16) % self.model_config.vocab_size, + (image_hash >> 32) % self.model_config.vocab_size, + (image_hash >> 64) % self.model_config.vocab_size, ] req.image_size = recv_req.image_size ( diff --git a/python/sglang/srt/mm_utils.py b/python/sglang/srt/mm_utils.py index e09c8215c6..7918f3f711 100644 --- a/python/sglang/srt/mm_utils.py +++ b/python/sglang/srt/mm_utils.py @@ -13,10 +13,25 @@ limitations under the License. """ -# Source: https://github.com/haotian-liu/LLaVA/blob/main/llava/mm_utils.py +# Source: https://github.com/LLaVA-VL/LLaVA-NeXT/blob/main/llava/mm_utils.py +""" +Utilities for multi-modal models. + +This python file mainly contains utilities that were used in the +image processing logic of llava-next including operations such as +anyres and anyres_max + +Currently supports the anyres and anyres_max operation for CLIP and +SigLip. 
For more information, you may refer to the paper or the blog + +LLaVA-NeXT : https://llava-vl.github.io/blog/2024-01-30-llava-next/ +LLaVA-Onevision : https://arxiv.org/pdf/2408.03326 + +""" import ast import base64 import math +import re from io import BytesIO import numpy as np @@ -40,10 +55,13 @@ def select_best_resolution(original_size, possible_resolutions): min_wasted_resolution = float("inf") for width, height in possible_resolutions: + # Calculate the downscaled size to keep the aspect ratio scale = min(width / original_width, height / original_height) downscaled_width, downscaled_height = int(original_width * scale), int( original_height * scale ) + + # Calculate effective and wasted resolutions effective_resolution = min( downscaled_width * downscaled_height, original_width * original_height ) @@ -129,6 +147,26 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): Returns: tuple: The shape of the image patch grid in the format (width, height). """ + if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints: + assert patch_size in [ + 224, + 336, + 384, + 448, + 512, + ], "patch_size should be in [224, 336, 384, 448, 512]" + # Use regex to extract the range from the input string + matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints) + range_start = tuple(map(int, matches[0])) + range_end = tuple(map(int, matches[-1])) + # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1]) + grid_pinpoints = [ + (i, j) + for i in range(range_start[0], range_end[0] + 1) + for j in range(range_start[1], range_end[1] + 1) + ] + # Multiply all elements by patch_size + grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints] if type(grid_pinpoints) is list: possible_resolutions = grid_pinpoints else: @@ -149,6 +187,31 @@ def process_anyres_image(image, processor, grid_pinpoints): Returns: np.array: An np array containing the processed image patches. 
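As a concrete illustration of the string-form `grid_pinpoints` handling added above, the sketch below expands a "(1x1),...,(6x6)" specification into pixel resolutions. The helper name and the 384-pixel SigLIP patch size are assumptions for illustration only, not part of the patch.

```python
import re


def expand_grid_pinpoints(grid_pinpoints: str, patch_size: int = 384):
    # Take the first and last "(AxB)" pairs; they bound an inclusive grid range.
    matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
    start = tuple(map(int, matches[0]))
    end = tuple(map(int, matches[-1]))
    grids = [
        (i, j)
        for i in range(start[0], end[0] + 1)
        for j in range(start[1], end[1] + 1)
    ]
    # Scale each grid shape by the vision tower input size to get pixel resolutions.
    return [[dim * patch_size for dim in pair] for pair in grids]


print(expand_grid_pinpoints("(1x1),...,(6x6)")[:3])
# [[384, 384], [384, 768], [384, 1152]]
```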
""" + if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints: + try: + patch_size = processor.size[0] + except Exception as e: + patch_size = processor.size["shortest_edge"] + assert patch_size in [ + 224, + 336, + 384, + 448, + 512, + ], "patch_size should be in [224, 336, 384, 448, 512]" + # Use regex to extract the range from the input string + matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints) + range_start = tuple(map(int, matches[0])) + range_end = tuple(map(int, matches[-1])) + # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1]) + grid_pinpoints = [ + (i, j) + for i in range(range_start[0], range_end[0] + 1) + for j in range(range_start[1], range_end[1] + 1) + ] + # Multiply all elements by patch_size + grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints] + if type(grid_pinpoints) is list: possible_resolutions = grid_pinpoints else: @@ -156,15 +219,24 @@ def process_anyres_image(image, processor, grid_pinpoints): best_resolution = select_best_resolution(image.size, possible_resolutions) image_padded = resize_and_pad_image(image, best_resolution) - patches = divide_to_patches(image_padded, processor.crop_size["height"]) - - image_original_resize = image.resize( - (processor.size["shortest_edge"], processor.size["shortest_edge"]) + # For Siglip processor, only have size but no crop size + crop_size = ( + processor.crop_size["height"] + if "crop_size" in processor.__dict__ + else processor.size["height"] ) + shortest_edge = ( + processor.size["shortest_edge"] + if "shortest_edge" in processor.size + else processor.size["height"] + ) + patches = divide_to_patches(image_padded, crop_size) + + image_original_resize = image.resize((shortest_edge, shortest_edge)) image_patches = [image_original_resize] + patches image_patches = [ - processor.preprocess(image_patch)["pixel_values"][0] + processor.preprocess(image_patch.convert("RGB"))["pixel_values"][0] for image_patch in image_patches ] return np.stack(image_patches, axis=0) @@ -255,7 +327,7 @@ def process_images(images, image_processor, model_cfg): ) image = image_processor.preprocess(image)["pixel_values"][0] new_images.append(image) - elif image_aspect_ratio == "anyres": + elif "anyres" in image_aspect_ratio: for image in images: image = process_anyres_image( image, image_processor, model_cfg.image_grid_pinpoints diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py index bac0a05378..98daeaece4 100644 --- a/python/sglang/srt/model_executor/forward_batch_info.py +++ b/python/sglang/srt/model_executor/forward_batch_info.py @@ -88,14 +88,19 @@ def init_multimuldal_info(self, batch: ScheduleBatch): reqs = batch.reqs self.pixel_values = [r.pixel_values for r in reqs] self.image_sizes = [r.image_size for r in reqs] - self.image_offsets = [ - ( - (r.image_offset - batch.prefix_lens_cpu[i]) - if r.image_offset is not None - else 0 - ) - for i, r in enumerate(reqs) - ] + self.image_offsets = [] + for r in reqs: + if isinstance(r.image_offset, list): + self.image_offsets.append( + [ + (image_offset - len(r.prefix_indices)) + for image_offset in r.image_offset + ] + ) + elif isinstance(r.image_offset, int): + self.image_offsets.append(r.image_offset - len(r.prefix_indices)) + elif r.image_offset is None: + self.image_offsets.append(0) def compute_positions(self, batch: ScheduleBatch): position_ids_offsets = batch.position_ids_offsets diff --git a/python/sglang/srt/models/llava.py 
b/python/sglang/srt/models/llava.py index a885a6e595..76a0630fc2 100644 --- a/python/sglang/srt/models/llava.py +++ b/python/sglang/srt/models/llava.py @@ -15,6 +15,8 @@ """Inference-only LLaVa model compatible with HuggingFace weights.""" +import math +import re from typing import Iterable, List, Optional, Tuple import numpy as np @@ -26,6 +28,8 @@ LlavaConfig, MistralConfig, Qwen2Config, + SiglipVisionConfig, + SiglipVisionModel, ) from transformers.models.llava.modeling_llava import LlavaMultiModalProjector from vllm.config import CacheConfig @@ -63,34 +67,61 @@ def __init__( ) def pad_input_ids(self, input_ids, pad_value, pt_shape=None, image_size=None): - new_image_feature_len = self.image_feature_len - # now only support spatial_unpad + anyres - if self.mm_patch_merge_type.startswith("spatial"): + + # hardcode for spatial_unpad + anyres + image_aspect_ratio = "anyres" if len(image_size) == 1 else "pad" + offset_list = [] + for image_s in image_size: + if len(image_size) > 16: + # 2x2 pooling with stride 2 + new_image_feature_len = ( + math.ceil(self.image_size / self.patch_size / 2) ** 2 + ) + else: + new_image_feature_len = self.image_feature_len # multiimage + height = width = self.num_patches_per_side - if pt_shape[0] > 1: - if self.image_aspect_ratio == "anyres": - num_patch_width, num_patch_height = get_anyres_image_grid_shape( - image_size, - self.image_grid_pinpoints, - self.vision_tower.config.image_size, + if "anyres" in image_aspect_ratio: + num_patch_width, num_patch_height = get_anyres_image_grid_shape( + image_s, + self.image_grid_pinpoints, + self.vision_tower.config.image_size, + ) + h = num_patch_height * height + w = num_patch_width * width + new_h, new_w = unpad_image_shape(h, w, image_s) + + if "anyres_max" in self.config.image_aspect_ratio: + matched_anyres_max_num_patches = re.match( + r"anyres_max_(\d+)", self.config.image_aspect_ratio + ) + if matched_anyres_max_num_patches: + max_num_patches = int(matched_anyres_max_num_patches.group(1)) + # times = math.sqrt(h * w / (max_num_patches * unit**2)) + times = math.sqrt( + new_h * new_w / (max_num_patches * self.image_feature_len) ) - if "unpad" in self.mm_patch_merge_type: - h = num_patch_height * height - w = num_patch_width * width - new_h, new_w = unpad_image_shape(h, w, image_size) - new_image_feature_len += new_h * (new_w + 1) - - pad_ids = pad_value * ( - (new_image_feature_len + len(pad_value)) // len(pad_value) - ) - offset = input_ids.index(self.config.image_token_index) - # old_len + pad_len - 1, because we need to remove image_token_id - new_input_ids = ( - input_ids[:offset] - + pad_ids[:new_image_feature_len] - + input_ids[offset + 1 :] - ) - return new_input_ids, offset + if times > 1.1: + new_h = int(new_h // times) + new_w = int(new_w // times) + new_image_feature_len += new_h * (new_w + 1) + + pad_ids = pad_value * ( + (new_image_feature_len + len(pad_value)) // len(pad_value) + ) + # print("calculated new_image_feature_len: ", new_image_feature_len) + try: + offset = input_ids.index(self.config.image_token_index) + except ValueError: + offset = 0 + # old_len + pad_len - 1, because we need to remove image_token_id + input_ids = ( + input_ids[:offset] + + pad_ids[:new_image_feature_len] + + input_ids[offset + 1 :] + ) + offset_list.append(offset) + return input_ids, offset_list def encode_images(self, pixel_values: torch.Tensor) -> torch.Tensor: image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) @@ -124,7 +155,6 @@ def forward( # Embed text input input_embeds = 
self.language_model.model.embed_tokens(input_ids) - # Embed vision input need_vision = ( (positions[input_metadata.extend_start_loc] < self.image_feature_len) @@ -163,27 +193,73 @@ def forward( if self.mm_patch_merge_type.startswith("spatial"): new_image_features = [] + height = width = self.num_patches_per_side for image_idx, image_feature in enumerate(image_features): - if image_feature.shape[0] > 1: + if len(image_sizes[image_idx]) == 1: + image_aspect_ratio = ( + self.config.image_aspect_ratio + ) # single image + else: + image_aspect_ratio = "pad" # multi image + # image_aspect_ratio = ( + # "anyres" if len(image_sizes[image_idx]) == 1 else "pad" + # ) + if ( + image_feature.shape[0] > 1 + and "anyres" in image_aspect_ratio + ): base_image_feature = image_feature[0] image_feature = image_feature[1:] - height = width = self.num_patches_per_side assert height * width == base_image_feature.shape[0] - if self.image_aspect_ratio == "anyres": - ( - num_patch_width, - num_patch_height, - ) = get_anyres_image_grid_shape( - image_sizes[image_idx], - self.image_grid_pinpoints, - self.vision_tower.config.image_size, + + if "anyres_max" in image_aspect_ratio: + matched_anyres_max_num_patches = re.match( + r"anyres_max_(\d+)", image_aspect_ratio ) + if matched_anyres_max_num_patches: + max_num_patches = int( + matched_anyres_max_num_patches.group(1) + ) + + if ( + image_aspect_ratio == "anyres" + or "anyres_max" in image_aspect_ratio + ): + vision_tower_image_size = self.image_size + try: + num_patch_width, num_patch_height = ( + get_anyres_image_grid_shape( + image_sizes[image_idx][0], + self.config.image_grid_pinpoints, + vision_tower_image_size, + ) + ) + except Exception as e: + print(f"Error: {e}") + num_patch_width, num_patch_height = 2, 2 image_feature = image_feature.view( num_patch_height, num_patch_width, height, width, -1 ) else: - raise NotImplementedError() + image_feature = image_feature.view( + 2, 2, height, width, -1 + ) + + # ( + # num_patch_width, + # num_patch_height, + # ) = get_anyres_image_grid_shape( + # image_sizes[image_idx][0], + # self.image_grid_pinpoints, + # self.vision_tower.config.image_size, + # ) + + # image_feature = image_feature.view( + # num_patch_height, num_patch_width, height, width, -1 + # ) + if "unpad" in self.mm_patch_merge_type: + unit = image_feature.shape[2] image_feature = image_feature.permute( 4, 0, 2, 1, 3 ).contiguous() @@ -191,8 +267,23 @@ def forward( 2, 3 ) image_feature = unpad_image( - image_feature, image_sizes[image_idx] + image_feature, image_sizes[image_idx][0] ) + if ( + "anyres_max" in image_aspect_ratio + and matched_anyres_max_num_patches + ): + c, h, w = image_feature.shape + times = math.sqrt( + h * w / (max_num_patches * unit**2) + ) + if times > 1.1: + image_feature = image_feature[None] + image_feature = nn.functional.interpolate( + image_feature, + [int(h // times), int(w // times)], + mode="bilinear", + )[0] image_feature = torch.cat( ( image_feature, @@ -213,16 +304,31 @@ def forward( image_feature = torch.cat( (base_image_feature, image_feature), dim=0 ) + image_feature = image_feature.unsqueeze(0) else: - image_feature = image_feature[0] - if "unpad" in self.mm_patch_merge_type: - image_feature = torch.cat( - ( - image_feature, - self.language_model.model.image_newline[None], - ), - dim=0, + if image_feature.shape[0] > 16: # video + # 2x2 pooling + num_of_frames = image_feature.shape[0] + image_feature = image_feature.view( + num_of_frames, height, width, -1 ) + image_feature = image_feature.permute( + 0, 3, 1, 2 + 
).contiguous() # N, C, H, W + height, weight = image_feature.shape[2:] + scaled_shape = [ + math.ceil(height / 2), + math.ceil(weight / 2), + ] + image_feature = nn.functional.interpolate( + image_feature, size=scaled_shape, mode="bilinear" + ) + image_feature = ( + image_feature.flatten(2) + .transpose(1, 2) + .contiguous() + ) # N, C, H*W + new_image_features.append(image_feature) image_features = new_image_features @@ -233,21 +339,22 @@ def forward( continue start_idx = extend_start_loc_cpu[i] - pad_len, pad_dim = image_features[pt].shape # 576, 4096 + pad_dim = image_features[pt].shape[-1] # 576, 4096 dim = input_embeds.shape[1] assert ( pad_dim == dim ), "invalid pad_dim={}, input_embed_dim={}!".format(pad_dim, dim) # Fill in the placeholder for the image try: - input_embeds[ - start_idx - + image_offsets[i] : start_idx - + image_offsets[i] - + pad_len - ] = image_features[pt] + for j, image_off in enumerate(image_offsets[i]): + # print("actual image_features length: ", image_features[pt][j].shape[0]) + pad_len = image_features[pt][j].shape[0] + input_embeds[ + start_idx + image_off : start_idx + image_off + pad_len + ] = image_features[pt][j] except RuntimeError as e: print(f"RuntimeError in llava image encoding: {e}") + print(image_features[pt].shape) print(input_embeds.shape) print(start_idx, image_offsets[i]) pt += 1 @@ -262,9 +369,16 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # load clip vision model by cfg['mm_vision_tower']: # huggingface_name or path_of_clip_relative_to_llava_model_dir vision_path = self.config.mm_vision_tower - self.vision_tower = CLIPVisionModel.from_pretrained( - vision_path, torch_dtype=torch.float16 - ).cuda() + if "clip" in vision_path: + self.vision_tower = CLIPVisionModel.from_pretrained( + vision_path, torch_dtype=torch.float16 + ).cuda() + elif "siglip" in vision_path: + self.vision_tower = SiglipVisionModel.from_pretrained( + vision_path, torch_dtype=torch.float16 + ).cuda() + # Siglip needs all feature tokens + self.config.mm_vision_select_feature = "full" self.vision_tower.eval() self.vision_feature_layer = self.config.mm_vision_select_layer @@ -276,8 +390,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): self.image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square") self.image_grid_pinpoints = getattr(self.config, "image_grid_pinpoints", None) - self.image_feature_len = int((self.image_size / self.patch_size) ** 2) - if self.vision_feature_select_strategy == "patch": + self.image_feature_len = int((self.image_size // self.patch_size) ** 2) + if ( + self.vision_feature_select_strategy == "patch" + or self.vision_feature_select_strategy == "full" + ): pass elif self.vision_feature_select_strategy == "cls_patch": self.image_feature_len += 1 diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py index c599d8b368..3e858dfa72 100644 --- a/test/srt/test_vision_openai_server.py +++ b/test/srt/test_vision_openai_server.py @@ -1,17 +1,27 @@ +import base64 +import io import json +import os +import sys +import time import unittest +import numpy as np import openai +import requests +from decord import VideoReader, cpu +from PIL import Image from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.utils import kill_child_process from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server +# python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --tokenizer-path 
lmms-lab/llavanext-qwen-siglip-tokenizer --port=30000 --host=127.0.0.1 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384 class TestOpenAIVisionServer(unittest.TestCase): @classmethod def setUpClass(cls): - cls.model = "liuhaotian/llava-v1.6-vicuna-7b" + cls.model = "lmms-lab/llava-onevision-qwen2-0.5b-ov" cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( @@ -21,9 +31,11 @@ def setUpClass(cls): api_key=cls.api_key, other_args=[ "--chat-template", - "vicuna_v1.1", + "chatml-llava", "--tokenizer-path", - "llava-hf/llava-1.5-7b-hf", + "lmms-lab/llavanext-qwen-siglip-tokenizer", + "--chunked-prefill-size", + "16384", "--log-requests", ], ) @@ -68,6 +80,81 @@ def test_chat_completion(self): assert response.usage.completion_tokens > 0 assert response.usage.total_tokens > 0 + def prepare_video_messages(self, video_path): + max_frames_num = 32 + vr = VideoReader(video_path, ctx=cpu(0)) + total_frame_num = len(vr) + uniform_sampled_frames = np.linspace( + 0, total_frame_num - 1, max_frames_num, dtype=int + ) + frame_idx = uniform_sampled_frames.tolist() + frames = vr.get_batch(frame_idx).asnumpy() + + base64_frames = [] + for frame in frames: + pil_img = Image.fromarray(frame) + buff = io.BytesIO() + pil_img.save(buff, format="JPEG") + base64_str = base64.b64encode(buff.getvalue()).decode("utf-8") + base64_frames.append(base64_str) + + messages = [{"role": "user", "content": []}] + frame_format = { + "type": "image_url", + "image_url": {"url": "data:image/jpeg;base64,{}"}, + } + + for base64_frame in base64_frames: + frame_format["image_url"]["url"] = "data:image/jpeg;base64,{}".format( + base64_frame + ) + messages[0]["content"].append(frame_format.copy()) + + prompt = {"type": "text", "text": "Please describe the video in detail."} + messages[0]["content"].append(prompt) + + return messages + + def test_video_chat_completion(self): + url = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4" + cache_dir = os.path.expanduser("~/.cache") + file_path = os.path.join(cache_dir, "jobs.mp4") + os.makedirs(cache_dir, exist_ok=True) + + if not os.path.exists(file_path): + response = requests.get(url) + response.raise_for_status() + + with open(file_path, "wb") as f: + f.write(response.content) + + client = openai.Client(api_key=self.api_key, base_url=self.base_url) + + messages = self.prepare_video_messages(file_path) + + start_time = time.time() + video_request = client.chat.completions.create( + model="default", + messages=messages, + temperature=0, + max_tokens=1024, + stream=True, + ) + print("-" * 30) + video_response = "" + + for chunk in video_request: + if chunk.choices[0].delta.content is not None: + content = chunk.choices[0].delta.content + video_response += content + sys.stdout.write(content) + sys.stdout.flush() + print("-" * 30) + + # Add assertions to validate the video response + self.assertIsNotNone(video_response) + self.assertGreater(len(video_response), 0) + def test_regex(self): client = openai.Client(api_key=self.api_key, base_url=self.base_url) From c9064e6fd9a5356ee579e9d452bfad725f8e6f2c Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Sat, 24 Aug 2024 18:58:16 +1000 Subject: [PATCH 071/118] feat: use gelu_tanh_and_mul (#1193) --- python/sglang/srt/layers/activation.py | 18 ++++++++- python/sglang/srt/models/gemma2.py | 4 +- python/sglang/test/test_activation.py | 55 ++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 3 deletions(-) create mode 100644 
python/sglang/test/test_activation.py diff --git a/python/sglang/srt/layers/activation.py b/python/sglang/srt/layers/activation.py index a6f05610bd..d0e0626604 100644 --- a/python/sglang/srt/layers/activation.py +++ b/python/sglang/srt/layers/activation.py @@ -15,7 +15,7 @@ import torch import torch.nn.functional as F -from flashinfer.activation import silu_and_mul +from flashinfer.activation import gelu_tanh_and_mul, silu_and_mul from vllm.model_executor.custom_op import CustomOp @@ -37,3 +37,19 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: out = torch.empty(output_shape, dtype=x.dtype, device=x.device) silu_and_mul(x, out) return out + + +class GeluAndMul(CustomOp): + def __init__(self, **kwargs): + super().__init__() + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + return F.gelu(x[..., :d], approximate="tanh") * x[..., d:] + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + gelu_tanh_and_mul(x, out) + return out diff --git a/python/sglang/srt/models/gemma2.py b/python/sglang/srt/models/gemma2.py index 80b99742e3..37d926c34f 100644 --- a/python/sglang/srt/models/gemma2.py +++ b/python/sglang/srt/models/gemma2.py @@ -25,7 +25,6 @@ # FIXME: temporary solution, remove after next vllm release from vllm.model_executor.custom_op import CustomOp -from vllm.model_executor.layers.activation import GeluAndMul # from vllm.model_executor.layers.layernorm import GemmaRMSNorm from vllm.model_executor.layers.linear import ( @@ -39,6 +38,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from sglang.srt.layers.activation import GeluAndMul from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -135,7 +135,7 @@ def __init__( "function. Please set `hidden_act` and `hidden_activation` to " "`gelu_pytorch_tanh`." 
) - self.act_fn = GeluAndMul(approximate="tanh") + self.act_fn = GeluAndMul() def forward(self, x: torch.Tensor) -> torch.Tensor: gate_up, _ = self.gate_up_proj(x) diff --git a/python/sglang/test/test_activation.py b/python/sglang/test/test_activation.py new file mode 100644 index 0000000000..357a23319b --- /dev/null +++ b/python/sglang/test/test_activation.py @@ -0,0 +1,55 @@ +import itertools +import unittest + +import torch + +from sglang.srt.layers.activation import GeluAndMul + + +class TestGeluAndMul(unittest.TestCase): + DTYPES = [torch.half, torch.bfloat16] + NUM_TOKENS = [7, 83, 2048] + D = [512, 4096, 5120, 13824] + SEEDS = [0] + + @classmethod + def setUpClass(cls): + if not torch.cuda.is_available(): + raise unittest.SkipTest("CUDA is not available") + torch.set_default_device("cuda") + + def _run_gelu_and_mul_test(self, num_tokens, d, dtype, seed): + torch.manual_seed(seed) + + layer = GeluAndMul().to(dtype=dtype) + x = torch.randn(num_tokens, 2 * d, dtype=dtype) + + with torch.inference_mode(): + ref_out = layer.forward_native(x) + out = layer.forward_cuda(x) + + if dtype == torch.bfloat16: + atol = rtol = 1e-2 + else: + atol = rtol = 1e-3 + + self.assertTrue(torch.allclose(out, ref_out, atol=atol, rtol=rtol)) + + def test_gelu_and_mul(self): + for params in itertools.product( + self.NUM_TOKENS, + self.D, + self.DTYPES, + self.SEEDS, + ): + with self.subTest( + num_tokens=params[0], + d=params[1], + dtype=params[2], + seed=params[3], + ): + self._run_gelu_and_mul_test(*params) + + +if __name__ == "__main__": + unittest.main(verbosity=2) From f6af3a6561b2528531bcb4815012b085280d4ec7 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sat, 24 Aug 2024 08:02:23 -0700 Subject: [PATCH 072/118] Cleanup readme, llava examples, usage examples and nccl init (#1194) --- README.md | 39 ++++---- docs/en/sampling_params.md | 9 +- .../quick_start/anthropic_example_chat.py | 0 .../quick_start/anthropic_example_complete.py | 0 .../quick_start/azure_openai_example_chat.py | 0 .../quick_start/gemini_example_chat.py | 0 .../quick_start/gemini_example_complete.py | 0 .../gemini_example_multimodal_chat.py | 0 .../quick_start/images/cat.jpeg | Bin .../quick_start/images/dog.jpeg | Bin .../quick_start/local_example_chat.py} | 2 +- .../quick_start/local_example_complete.py} | 2 +- .../quick_start/local_example_llava_next.py} | 23 ++++- .../quick_start/openai_example_chat.py | 0 .../quick_start/openai_example_complete.py | 0 .../quick_start/openrouter_example_chat.py | 0 .../quick_start/together_example_chat.py | 0 .../quick_start/together_example_complete.py | 0 .../usage/chinese_regex.py | 0 .../usage/choices_logprob.py | 0 .../usage/cot_decoding.py | 0 .../usage/json_decode.py | 0 .../usage/json_logprobs.py | 0 .../usage/llava_video/srt_example_llava_v.py | 5 +- .../usage/llava_video/srt_example_llava_v.sh | 0 .../usage/openai_chat_speculative.py | 0 .../usage/openai_parallel_sample.py | 0 .../usage/openai_speculative.py | 0 .../usage/parallel_sample.py | 0 .../trace_and_evaluate_rag_using_parea.ipynb | 0 .../usage/readme_examples.py | 0 .../usage/streaming.py | 0 .../usage/triton/Dockerfile | 0 .../usage/triton/README.md | 0 .../models/character_generation/1/model.py | 0 .../models/character_generation/config.pbtxt | 0 examples/quick_start/srt_example_yi_vl.py | 70 -------------- .../async_io.py => runtime/async_io_api.py} | 0 .../http_llama3_llava_test.py | 3 +- .../http_llava_onevision_test.py | 9 +- .../llava_onevision}/http_qwen_llava_test.py | 3 +- .../{usage => runtime}/openai_batch_chat.py 
| 0 .../openai_batch_complete.py | 0 examples/usage/llava/srt_llava_next_test.py | 90 ------------------ .../max-tokens-fixed-rag-trace.png | Bin 134888 -> 0 bytes python/sglang/bench_latency.py | 6 +- python/sglang/lang/chat_template.py | 4 +- python/sglang/launch_server_llavavid.py | 29 ------ python/sglang/srt/layers/decode_attention.py | 2 +- python/sglang/srt/layers/fused_moe/layer.py | 4 +- python/sglang/srt/layers/logits_processor.py | 4 +- python/sglang/srt/managers/schedule_batch.py | 2 +- .../sglang/srt/managers/tokenizer_manager.py | 3 + .../sglang/srt/model_executor/model_runner.py | 86 +++++++++++------ python/sglang/srt/models/gemma2.py | 12 ++- python/sglang/srt/models/grok.py | 17 +++- python/sglang/srt/server_args.py | 19 ++-- python/sglang/srt/utils.py | 6 +- python/sglang/test/runners.py | 22 ++--- scripts/{ => deprecated}/convert_yi_vl.py | 0 scripts/{ => deprecated}/convert_yi_vl.sh | 0 test/srt/models/test_embedding_models.py | 2 +- test/srt/models/test_generation_models.py | 2 +- test/srt/run_suite.py | 2 +- test/srt/test_vision_openai_server.py | 14 +-- 65 files changed, 174 insertions(+), 317 deletions(-) rename examples/{ => frontend_language}/quick_start/anthropic_example_chat.py (100%) rename examples/{ => frontend_language}/quick_start/anthropic_example_complete.py (100%) rename examples/{ => frontend_language}/quick_start/azure_openai_example_chat.py (100%) rename examples/{ => frontend_language}/quick_start/gemini_example_chat.py (100%) rename examples/{ => frontend_language}/quick_start/gemini_example_complete.py (100%) rename examples/{ => frontend_language}/quick_start/gemini_example_multimodal_chat.py (100%) rename examples/{ => frontend_language}/quick_start/images/cat.jpeg (100%) rename examples/{ => frontend_language}/quick_start/images/dog.jpeg (100%) rename examples/{quick_start/srt_example_chat.py => frontend_language/quick_start/local_example_chat.py} (98%) rename examples/{quick_start/srt_example_complete.py => frontend_language/quick_start/local_example_complete.py} (97%) rename examples/{quick_start/srt_example_llava.py => frontend_language/quick_start/local_example_llava_next.py} (69%) rename examples/{ => frontend_language}/quick_start/openai_example_chat.py (100%) rename examples/{ => frontend_language}/quick_start/openai_example_complete.py (100%) rename examples/{ => frontend_language}/quick_start/openrouter_example_chat.py (100%) rename examples/{ => frontend_language}/quick_start/together_example_chat.py (100%) rename examples/{ => frontend_language}/quick_start/together_example_complete.py (100%) rename examples/{ => frontend_language}/usage/chinese_regex.py (100%) rename examples/{ => frontend_language}/usage/choices_logprob.py (100%) rename examples/{ => frontend_language}/usage/cot_decoding.py (100%) rename examples/{ => frontend_language}/usage/json_decode.py (100%) rename examples/{ => frontend_language}/usage/json_logprobs.py (100%) rename examples/{ => frontend_language}/usage/llava_video/srt_example_llava_v.py (99%) rename examples/{ => frontend_language}/usage/llava_video/srt_example_llava_v.sh (100%) rename examples/{ => frontend_language}/usage/openai_chat_speculative.py (100%) rename examples/{ => frontend_language}/usage/openai_parallel_sample.py (100%) rename examples/{ => frontend_language}/usage/openai_speculative.py (100%) rename examples/{ => frontend_language}/usage/parallel_sample.py (100%) rename examples/{ => frontend_language}/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb (100%) rename examples/{ 
=> frontend_language}/usage/readme_examples.py (100%) rename examples/{ => frontend_language}/usage/streaming.py (100%) rename examples/{ => frontend_language}/usage/triton/Dockerfile (100%) rename examples/{ => frontend_language}/usage/triton/README.md (100%) rename examples/{ => frontend_language}/usage/triton/models/character_generation/1/model.py (100%) rename examples/{ => frontend_language}/usage/triton/models/character_generation/config.pbtxt (100%) delete mode 100644 examples/quick_start/srt_example_yi_vl.py rename examples/{usage/async_io.py => runtime/async_io_api.py} (100%) rename examples/{usage/llava => runtime/llava_onevision}/http_llama3_llava_test.py (94%) rename examples/{usage/llava => runtime/llava_onevision}/http_llava_onevision_test.py (96%) rename examples/{usage/llava => runtime/llava_onevision}/http_qwen_llava_test.py (95%) rename examples/{usage => runtime}/openai_batch_chat.py (100%) rename examples/{usage => runtime}/openai_batch_complete.py (100%) delete mode 100644 examples/usage/llava/srt_llava_next_test.py delete mode 100644 examples/usage/rag_using_parea/max-tokens-fixed-rag-trace.png delete mode 100644 python/sglang/launch_server_llavavid.py rename scripts/{ => deprecated}/convert_yi_vl.py (100%) rename scripts/{ => deprecated}/convert_yi_vl.sh (100%) diff --git a/README.md b/README.md index c118d6a1a0..04dd913baa 100644 --- a/README.md +++ b/README.md @@ -22,12 +22,13 @@ The core features include: ## News - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)). -- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)). +- [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)). - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

More +- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)). - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)). - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)). @@ -227,19 +228,14 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct - Gemma / Gemma 2 - Qwen / Qwen 2 / Qwen 2 MoE - DeepSeek / DeepSeek 2 -- LLaVA 1.5 / 1.6 - - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000` - - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000` - - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000` - - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --host=127.0.0.1 --tp-size=1 --chat-template=llava_llama_3` - - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --host="127.0.0.1" --tp-size=8 --chat-template=chatml-llava` -- LLaVA-NeXT-Video - - see [examples/usage/llava_video](examples/usage/llava_video) -- [LLaVA-OneVision](https://arxiv.org/abs/2408.03326) - - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --host=127.0.0.1 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384` - - see [test/srt/test_llava_onevision_openai_server.py](test/srt/test_llava_onevision_openai_server.py) +- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/) + - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384` + - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py) +- LLaVA 1.5 / 1.6 / NeXT + - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3` + - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava` + - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py) - Yi-VL - - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py). - StableLM - Command-R - DBRX @@ -250,6 +246,8 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md). #### Use Models From ModelScope +
+ To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE. ``` export SGLANG_USE_MODELSCOPE=true @@ -258,21 +256,20 @@ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instru ``` SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000 ``` +
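Since the supported-models list above points to the OpenAI Vision API for querying the vision-language servers, a minimal client sketch in the spirit of `test/srt/test_vision_openai_server.py` is shown below. The base URL, API key, and image link are placeholders; `model="default"` simply selects whatever model the server was launched with.

```python
import openai

# Assumes a vision-language server launched as shown above is listening locally;
# the URL, api_key, and image link are illustrative placeholders.
client = openai.Client(api_key="sk-123456", base_url="http://127.0.0.1:30000/v1")

response = client.chat.completions.create(
    model="default",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/cat.jpeg"},
                },
                {"type": "text", "text": "Describe this image in one sentence."},
            ],
        }
    ],
    temperature=0,
    max_tokens=64,
)
print(response.choices[0].message.content)
```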
#### Run Llama 3.1 405B ```bash -## Run 405B (fp8) on a single node +# Run 405B (fp8) on a single node python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8 -## Run 405B (fp16) on two nodes -# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily - -# on the first node -GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75 +# Run 405B (fp16) on two nodes +## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port +GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph -# on the second -GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75 +## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port +GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph ``` ### Benchmark Performance diff --git a/docs/en/sampling_params.md b/docs/en/sampling_params.md index 7d866e6929..54b03bf325 100644 --- a/docs/en/sampling_params.md +++ b/docs/en/sampling_params.md @@ -1,5 +1,8 @@ # Sampling Parameters in SGLang Runtime This doc describes the sampling parameters of the SGLang Runtime. +It is the low-level endpoint of the runtime. +If you want a high-level endpoint that can automatically handle chat templates, consider using the [OpenAI Compatible API +](https://github.com/sgl-project/sglang?tab=readme-ov-file#openai-compatible-api). The `/generate` endpoint accepts the following arguments in the JSON format. @@ -140,7 +143,7 @@ print("") Launch a server ``` -python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000 +python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --chat-template chatml-llava ``` Download an image @@ -155,7 +158,9 @@ import requests response = requests.post( "http://localhost:30000/generate", json={ - "text": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. 
USER: \nDescribe this picture ASSISTANT:", + "text": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n\nDescribe this image in a very short sentence.<|im_end|>\n" + "<|im_start|>assistant\n", "image_data": "example_image.png", "sampling_params": { "temperature": 0, diff --git a/examples/quick_start/anthropic_example_chat.py b/examples/frontend_language/quick_start/anthropic_example_chat.py similarity index 100% rename from examples/quick_start/anthropic_example_chat.py rename to examples/frontend_language/quick_start/anthropic_example_chat.py diff --git a/examples/quick_start/anthropic_example_complete.py b/examples/frontend_language/quick_start/anthropic_example_complete.py similarity index 100% rename from examples/quick_start/anthropic_example_complete.py rename to examples/frontend_language/quick_start/anthropic_example_complete.py diff --git a/examples/quick_start/azure_openai_example_chat.py b/examples/frontend_language/quick_start/azure_openai_example_chat.py similarity index 100% rename from examples/quick_start/azure_openai_example_chat.py rename to examples/frontend_language/quick_start/azure_openai_example_chat.py diff --git a/examples/quick_start/gemini_example_chat.py b/examples/frontend_language/quick_start/gemini_example_chat.py similarity index 100% rename from examples/quick_start/gemini_example_chat.py rename to examples/frontend_language/quick_start/gemini_example_chat.py diff --git a/examples/quick_start/gemini_example_complete.py b/examples/frontend_language/quick_start/gemini_example_complete.py similarity index 100% rename from examples/quick_start/gemini_example_complete.py rename to examples/frontend_language/quick_start/gemini_example_complete.py diff --git a/examples/quick_start/gemini_example_multimodal_chat.py b/examples/frontend_language/quick_start/gemini_example_multimodal_chat.py similarity index 100% rename from examples/quick_start/gemini_example_multimodal_chat.py rename to examples/frontend_language/quick_start/gemini_example_multimodal_chat.py diff --git a/examples/quick_start/images/cat.jpeg b/examples/frontend_language/quick_start/images/cat.jpeg similarity index 100% rename from examples/quick_start/images/cat.jpeg rename to examples/frontend_language/quick_start/images/cat.jpeg diff --git a/examples/quick_start/images/dog.jpeg b/examples/frontend_language/quick_start/images/dog.jpeg similarity index 100% rename from examples/quick_start/images/dog.jpeg rename to examples/frontend_language/quick_start/images/dog.jpeg diff --git a/examples/quick_start/srt_example_chat.py b/examples/frontend_language/quick_start/local_example_chat.py similarity index 98% rename from examples/quick_start/srt_example_chat.py rename to examples/frontend_language/quick_start/local_example_chat.py index b1e1658a2a..e1e4b62cca 100644 --- a/examples/quick_start/srt_example_chat.py +++ b/examples/frontend_language/quick_start/local_example_chat.py @@ -1,6 +1,6 @@ """ Usage: -python3 srt_example_chat.py +python3 local_example_chat.py """ import sglang as sgl diff --git a/examples/quick_start/srt_example_complete.py b/examples/frontend_language/quick_start/local_example_complete.py similarity index 97% rename from examples/quick_start/srt_example_complete.py rename to examples/frontend_language/quick_start/local_example_complete.py index 056245979f..00a451cf64 100644 --- a/examples/quick_start/srt_example_complete.py +++ b/examples/frontend_language/quick_start/local_example_complete.py @@ -1,6 +1,6 @@ """ Usage: -python3 
srt_example_complete.py +python3 local_example_complete.py """ import sglang as sgl diff --git a/examples/quick_start/srt_example_llava.py b/examples/frontend_language/quick_start/local_example_llava_next.py similarity index 69% rename from examples/quick_start/srt_example_llava.py rename to examples/frontend_language/quick_start/local_example_llava_next.py index 5d8f752394..823dc7b0e8 100644 --- a/examples/quick_start/srt_example_llava.py +++ b/examples/frontend_language/quick_start/local_example_llava_next.py @@ -1,8 +1,14 @@ """ -Usage: python3 srt_example_llava.py +Usage: python3 local_example_llava_next.py """ +from PIL import ImageFile + import sglang as sgl +from sglang.lang.chat_template import get_chat_template +from sglang.srt.utils import load_image + +ImageFile.LOAD_TRUNCATED_IMAGES = True # Allow loading of truncated images @sgl.function @@ -44,10 +50,17 @@ def batch(): if __name__ == "__main__": - runtime = sgl.Runtime( - model_path="liuhaotian/llava-v1.6-vicuna-7b", - tokenizer_path="llava-hf/llava-1.5-7b-hf", - ) + import multiprocessing as mp + + mp.set_start_method("spawn", force=True) + + runtime = sgl.Runtime(model_path="lmms-lab/llama3-llava-next-8b") + runtime.endpoint.chat_template = get_chat_template("llama-3-instruct") + + # Or you can use the 72B model + # runtime = sgl.Runtime(model_path="lmms-lab/llava-next-72b", tp_size=8) + # runtime.endpoint.chat_template = get_chat_template("chatml-llava") + sgl.set_default_backend(runtime) print(f"chat template: {runtime.endpoint.chat_template.name}") diff --git a/examples/quick_start/openai_example_chat.py b/examples/frontend_language/quick_start/openai_example_chat.py similarity index 100% rename from examples/quick_start/openai_example_chat.py rename to examples/frontend_language/quick_start/openai_example_chat.py diff --git a/examples/quick_start/openai_example_complete.py b/examples/frontend_language/quick_start/openai_example_complete.py similarity index 100% rename from examples/quick_start/openai_example_complete.py rename to examples/frontend_language/quick_start/openai_example_complete.py diff --git a/examples/quick_start/openrouter_example_chat.py b/examples/frontend_language/quick_start/openrouter_example_chat.py similarity index 100% rename from examples/quick_start/openrouter_example_chat.py rename to examples/frontend_language/quick_start/openrouter_example_chat.py diff --git a/examples/quick_start/together_example_chat.py b/examples/frontend_language/quick_start/together_example_chat.py similarity index 100% rename from examples/quick_start/together_example_chat.py rename to examples/frontend_language/quick_start/together_example_chat.py diff --git a/examples/quick_start/together_example_complete.py b/examples/frontend_language/quick_start/together_example_complete.py similarity index 100% rename from examples/quick_start/together_example_complete.py rename to examples/frontend_language/quick_start/together_example_complete.py diff --git a/examples/usage/chinese_regex.py b/examples/frontend_language/usage/chinese_regex.py similarity index 100% rename from examples/usage/chinese_regex.py rename to examples/frontend_language/usage/chinese_regex.py diff --git a/examples/usage/choices_logprob.py b/examples/frontend_language/usage/choices_logprob.py similarity index 100% rename from examples/usage/choices_logprob.py rename to examples/frontend_language/usage/choices_logprob.py diff --git a/examples/usage/cot_decoding.py b/examples/frontend_language/usage/cot_decoding.py similarity index 100% rename from 
examples/usage/cot_decoding.py rename to examples/frontend_language/usage/cot_decoding.py diff --git a/examples/usage/json_decode.py b/examples/frontend_language/usage/json_decode.py similarity index 100% rename from examples/usage/json_decode.py rename to examples/frontend_language/usage/json_decode.py diff --git a/examples/usage/json_logprobs.py b/examples/frontend_language/usage/json_logprobs.py similarity index 100% rename from examples/usage/json_logprobs.py rename to examples/frontend_language/usage/json_logprobs.py diff --git a/examples/usage/llava_video/srt_example_llava_v.py b/examples/frontend_language/usage/llava_video/srt_example_llava_v.py similarity index 99% rename from examples/usage/llava_video/srt_example_llava_v.py rename to examples/frontend_language/usage/llava_video/srt_example_llava_v.py index 7421dfcdfb..085bcea5a2 100644 --- a/examples/usage/llava_video/srt_example_llava_v.py +++ b/examples/frontend_language/usage/llava_video/srt_example_llava_v.py @@ -1,7 +1,8 @@ """ Usage: pip install opencv-python-headless -python3 srt_example_llava.py + +python3 srt_example_llava_v.py """ import argparse @@ -9,6 +10,8 @@ import os import time +import requests + import sglang as sgl diff --git a/examples/usage/llava_video/srt_example_llava_v.sh b/examples/frontend_language/usage/llava_video/srt_example_llava_v.sh similarity index 100% rename from examples/usage/llava_video/srt_example_llava_v.sh rename to examples/frontend_language/usage/llava_video/srt_example_llava_v.sh diff --git a/examples/usage/openai_chat_speculative.py b/examples/frontend_language/usage/openai_chat_speculative.py similarity index 100% rename from examples/usage/openai_chat_speculative.py rename to examples/frontend_language/usage/openai_chat_speculative.py diff --git a/examples/usage/openai_parallel_sample.py b/examples/frontend_language/usage/openai_parallel_sample.py similarity index 100% rename from examples/usage/openai_parallel_sample.py rename to examples/frontend_language/usage/openai_parallel_sample.py diff --git a/examples/usage/openai_speculative.py b/examples/frontend_language/usage/openai_speculative.py similarity index 100% rename from examples/usage/openai_speculative.py rename to examples/frontend_language/usage/openai_speculative.py diff --git a/examples/usage/parallel_sample.py b/examples/frontend_language/usage/parallel_sample.py similarity index 100% rename from examples/usage/parallel_sample.py rename to examples/frontend_language/usage/parallel_sample.py diff --git a/examples/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb b/examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb similarity index 100% rename from examples/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb rename to examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb diff --git a/examples/usage/readme_examples.py b/examples/frontend_language/usage/readme_examples.py similarity index 100% rename from examples/usage/readme_examples.py rename to examples/frontend_language/usage/readme_examples.py diff --git a/examples/usage/streaming.py b/examples/frontend_language/usage/streaming.py similarity index 100% rename from examples/usage/streaming.py rename to examples/frontend_language/usage/streaming.py diff --git a/examples/usage/triton/Dockerfile b/examples/frontend_language/usage/triton/Dockerfile similarity index 100% rename from examples/usage/triton/Dockerfile rename to examples/frontend_language/usage/triton/Dockerfile 
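To see how the renamed quick-start files fit together, here is a condensed sketch of the pattern they demonstrate. The model path and chat template follow `local_example_llava_next.py` above; the image path and question are illustrative.

```python
import multiprocessing as mp

import sglang as sgl
from sglang.lang.chat_template import get_chat_template


@sgl.function
def image_qa(s, image_path, question):
    # Compose a vision-language turn: image + question, then generate the answer.
    s += sgl.user(sgl.image(image_path) + question)
    s += sgl.assistant(sgl.gen("answer"))


if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)

    runtime = sgl.Runtime(model_path="lmms-lab/llama3-llava-next-8b")
    runtime.endpoint.chat_template = get_chat_template("llama-3-instruct")
    sgl.set_default_backend(runtime)

    state = image_qa.run(
        image_path="images/cat.jpeg", question="What is this?", max_new_tokens=64
    )
    print(state["answer"])

    runtime.shutdown()
```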
diff --git a/examples/usage/triton/README.md b/examples/frontend_language/usage/triton/README.md similarity index 100% rename from examples/usage/triton/README.md rename to examples/frontend_language/usage/triton/README.md diff --git a/examples/usage/triton/models/character_generation/1/model.py b/examples/frontend_language/usage/triton/models/character_generation/1/model.py similarity index 100% rename from examples/usage/triton/models/character_generation/1/model.py rename to examples/frontend_language/usage/triton/models/character_generation/1/model.py diff --git a/examples/usage/triton/models/character_generation/config.pbtxt b/examples/frontend_language/usage/triton/models/character_generation/config.pbtxt similarity index 100% rename from examples/usage/triton/models/character_generation/config.pbtxt rename to examples/frontend_language/usage/triton/models/character_generation/config.pbtxt diff --git a/examples/quick_start/srt_example_yi_vl.py b/examples/quick_start/srt_example_yi_vl.py deleted file mode 100644 index 66c7d57126..0000000000 --- a/examples/quick_start/srt_example_yi_vl.py +++ /dev/null @@ -1,70 +0,0 @@ -""" -Usage: python3 srt_example_yi_vl.py - -Requirements: transformers==4.38 -""" - -import sglang as sgl - - -@sgl.function -def image_qa(s, image_path, question): - s += sgl.user(sgl.image(image_path) + question) - s += sgl.assistant(sgl.gen("answer")) - - -def single(): - state = image_qa.run( - image_path="images/cat.jpeg", - question="What is this?", - max_new_tokens=64, - stop="###", - ) - print(state["answer"], "\n") - - -def stream(): - state = image_qa.run( - image_path="images/cat.jpeg", - question="What is this?", - max_new_tokens=64, - stream=True, - stop="###", - ) - - for out in state.text_iter("answer"): - print(out, end="", flush=True) - print() - - -def batch(): - states = image_qa.run_batch( - [ - {"image_path": "images/cat.jpeg", "question": "What is this?"}, - {"image_path": "images/dog.jpeg", "question": "What is this?"}, - ], - max_new_tokens=64, - stop="###", - ) - for s in states: - print(s["answer"], "\n") - - -if __name__ == "__main__": - runtime = sgl.Runtime(model_path="BabyChou/Yi-VL-6B") - # runtime = sgl.Runtime(model_path="BabyChou/Yi-VL-34B") - sgl.set_default_backend(runtime) - - # Run a single request - print("\n========== single ==========\n") - single() - - # Stream output - print("\n========== stream ==========\n") - stream() - - # Run a batch of requests - print("\n========== batch ==========\n") - batch() - - runtime.shutdown() diff --git a/examples/usage/async_io.py b/examples/runtime/async_io_api.py similarity index 100% rename from examples/usage/async_io.py rename to examples/runtime/async_io_api.py diff --git a/examples/usage/llava/http_llama3_llava_test.py b/examples/runtime/llava_onevision/http_llama3_llava_test.py similarity index 94% rename from examples/usage/llava/http_llama3_llava_test.py rename to examples/runtime/llava_onevision/http_llama3_llava_test.py index 813a26af53..a019e214d6 100644 --- a/examples/usage/llava/http_llama3_llava_test.py +++ b/examples/runtime/llava_onevision/http_llama3_llava_test.py @@ -4,7 +4,7 @@ # Installing latest sglang. 
# Endpoint Service CLI: -# python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --tokenizer-path lmms-lab/llama3-llava-next-8b-tokenizer --port=30000 --host="127.0.0.1" --tp-size=4 +python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 python3 http_llama3_llava_test.py @@ -16,7 +16,6 @@ import asyncio import copy import json -import time import aiohttp import requests diff --git a/examples/usage/llava/http_llava_onevision_test.py b/examples/runtime/llava_onevision/http_llava_onevision_test.py similarity index 96% rename from examples/usage/llava/http_llava_onevision_test.py rename to examples/runtime/llava_onevision/http_llava_onevision_test.py index c32d529819..40dc27ec20 100644 --- a/examples/usage/llava/http_llava_onevision_test.py +++ b/examples/runtime/llava_onevision/http_llava_onevision_test.py @@ -1,3 +1,11 @@ +""" +Usage: + +python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384 + +python3 http_llava_onevision_test.py +""" + import base64 import io import os @@ -74,7 +82,6 @@ def video_stream_request_test(client, video_path): print("------------------------Video Stream Request Test----------------------") messages = prepare_video_messages(video_path) - start_time = time.time() video_request = client.chat.completions.create( model="default", messages=messages, diff --git a/examples/usage/llava/http_qwen_llava_test.py b/examples/runtime/llava_onevision/http_qwen_llava_test.py similarity index 95% rename from examples/usage/llava/http_qwen_llava_test.py rename to examples/runtime/llava_onevision/http_qwen_llava_test.py index 1c29658c60..dca56e7a33 100644 --- a/examples/usage/llava/http_qwen_llava_test.py +++ b/examples/runtime/llava_onevision/http_qwen_llava_test.py @@ -4,7 +4,7 @@ # Installing latest sglang. 
# Endpoint Service CLI: -# python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --tokenizer-path lmms-lab/llavanext-qwen-tokenizer --port=30000 --host="127.0.0.1" --tp-size=4 +python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 python3 http_qwen_llava_test.py @@ -16,7 +16,6 @@ import asyncio import copy import json -import time import aiohttp import requests diff --git a/examples/usage/openai_batch_chat.py b/examples/runtime/openai_batch_chat.py similarity index 100% rename from examples/usage/openai_batch_chat.py rename to examples/runtime/openai_batch_chat.py diff --git a/examples/usage/openai_batch_complete.py b/examples/runtime/openai_batch_complete.py similarity index 100% rename from examples/usage/openai_batch_complete.py rename to examples/runtime/openai_batch_complete.py diff --git a/examples/usage/llava/srt_llava_next_test.py b/examples/usage/llava/srt_llava_next_test.py deleted file mode 100644 index 0f9621648a..0000000000 --- a/examples/usage/llava/srt_llava_next_test.py +++ /dev/null @@ -1,90 +0,0 @@ -""" -Usage: python3 srt_example_llava.py -""" - -from PIL import ImageFile - -import sglang as sgl -from sglang.lang.chat_template import get_chat_template -from sglang.srt.utils import load_image - -ImageFile.LOAD_TRUNCATED_IMAGES = True # Allow loading of truncated images - - -@sgl.function -def image_qa(s, image, question): - s += sgl.user(sgl.image(image) + question) - s += sgl.assistant(sgl.gen("answer")) - - -def single(): - image_url = "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg" - pil_image, _ = load_image(image_url) - state = image_qa.run(image=pil_image, question="What is this?", max_new_tokens=512) - print(state["answer"], "\n") - - -def stream(): - image_url = "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg" - pil_image, _ = load_image(image_url) - state = image_qa.run( - image=pil_image, - question="Please generate short caption for this image.", - max_new_tokens=512, - temperature=0, - stream=True, - ) - - for out in state.text_iter("answer"): - print(out, end="", flush=True) - print() - - -def batch(): - image_url = "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg" - pil_image, _ = load_image(image_url) - states = image_qa.run_batch( - [ - {"image": pil_image, "question": "What is this?"}, - {"image": pil_image, "question": "What is this?"}, - ], - max_new_tokens=512, - ) - for s in states: - print(s["answer"], "\n") - - -if __name__ == "__main__": - import multiprocessing as mp - - mp.set_start_method("spawn", force=True) - runtime = sgl.Runtime( - model_path="lmms-lab/llama3-llava-next-8b", - tokenizer_path="lmms-lab/llama3-llava-next-8b-tokenizer", - ) - runtime.endpoint.chat_template = get_chat_template("llama-3-instruct") - # runtime = sgl.Runtime( - # model_path="lmms-lab/llava-next-72b", - # tokenizer_path="lmms-lab/llavanext-qwen-tokenizer", - # ) - # runtime.endpoint.chat_template = get_chat_template("chatml-llava") - sgl.set_default_backend(runtime) - print(f"chat template: {runtime.endpoint.chat_template.name}") - - # Or you can use API models - # sgl.set_default_backend(sgl.OpenAI("gpt-4-vision-preview")) - # sgl.set_default_backend(sgl.VertexAI("gemini-pro-vision")) - - # Run a single request - print("\n========== single ==========\n") - single() - - # Stream output - print("\n========== stream ==========\n") - stream() - - # Run a batch of requests - print("\n========== batch ==========\n") - batch() - - runtime.shutdown() diff --git 
a/examples/usage/rag_using_parea/max-tokens-fixed-rag-trace.png b/examples/usage/rag_using_parea/max-tokens-fixed-rag-trace.png deleted file mode 100644 index 2ea09fdc60209c707655451a589f2a904b6eb25b..0000000000000000000000000000000000000000 GIT binary patch
-To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE. +To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE. ``` export SGLANG_USE_MODELSCOPE=true ``` Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server ``` SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000 -``` +``` +
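Once the ModelScope-backed server above is running, it exposes the same OpenAI-compatible API as any other SGLang deployment. A minimal client sketch, assuming the default port 30000 from the command above and no API key configured on the server (the key below is only a placeholder):

```python
import openai

# Talk to the local SGLang server launched above; port 30000 is assumed.
client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "What is the capital of China?"}],
    temperature=0,
    max_tokens=64,
)
print(response.choices[0].message.content)
```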
#### Run Llama 3.1 405B +
```bash # Run 405B (fp8) on a single node @@ -272,6 +274,8 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph ``` +
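As a quick smoke test of either deployment, the server's native generation endpoint can be queried directly. A rough sketch, assuming the default port 30000 (neither launch command above overrides it):

```python
import requests

# Send one request to the native /generate endpoint of the server launched above.
response = requests.post(
    "http://127.0.0.1:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {"max_new_tokens": 16, "temperature": 0},
    },
)
print(response.json())
```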
+ ### Benchmark Performance - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`. @@ -407,7 +411,7 @@ def tip_suggestion(s): s += "In summary" + sgl.gen("summary") ``` -#### Multi Modality +#### Multi-Modality Use `sgl.image` to pass an image as input. ```python @@ -461,7 +465,7 @@ def character_gen(s, name): s += sgl.gen("json_output", max_tokens=256, regex=character_regex) ``` -See also [json_decode.py](examples/usage/json_decode.py) for an additional example on specifying formats with Pydantic models. +See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models. #### Batching Use `run_batch` to run a batch of requests with continuous batching. @@ -523,7 +527,6 @@ def chat_example(s): - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability. - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`. - ## Benchmark And Performance ![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg) ![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg) From e61d13acdf3193606c3bc57fb59f0de33eab7490 Mon Sep 17 00:00:00 2001 From: Ying Sheng Date: Sat, 24 Aug 2024 18:35:55 -0700 Subject: [PATCH 074/118] [CI] Fix the problem of hf runner too slow (#1202) --- python/sglang/test/runners.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index e519c92829..4fc1f0f259 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -86,7 +86,6 @@ def start_model_process( self.tokenizer = AutoTokenizer.from_pretrained( model_path, torch_dtype=torch_dtype, - trust_remote_code=True, ) self.is_generation_model = is_generation_model @@ -96,7 +95,6 @@ def start_model_process( model_path, torch_dtype=torch_dtype, low_cpu_mem_usage=True, - trust_remote_code=True, ).cuda() else: from sentence_transformers import SentenceTransformer From 1cb4da5c5f1fbaafa5c48b052b1f05abedd97fe5 Mon Sep 17 00:00:00 2001 From: Ying Sheng Date: Sat, 24 Aug 2024 21:43:03 -0700 Subject: [PATCH 075/118] [Fix] the issue of random order when input is a list (#1199) --- .../sglang/srt/managers/tokenizer_manager.py | 14 +++++------ python/sglang/srt/server.py | 4 ++-- python/sglang/test/runners.py | 2 +- test/srt/models/test_embedding_models.py | 23 +++++++++++-------- 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 8420f20dd1..8f67005755 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -437,13 +437,13 @@ async def _handle_batch_request( is_stream = hasattr(obj, "stream") and obj.stream tasks = [asyncio.create_task(gen.__anext__()) for gen in 
generators] - output_list = [] + output_list = [None] * len(tasks) while tasks: done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED) for task in done: - gen_index = tasks.index(task) + cur_index = tasks.index(task) try: result = task.result() @@ -451,14 +451,14 @@ async def _handle_batch_request( if is_stream: yield result else: - output_list.append(result) + output_list[result["index"]] = result - tasks[gen_index] = asyncio.create_task( - generators[gen_index].__anext__() + tasks[cur_index] = asyncio.create_task( + generators[cur_index].__anext__() ) except StopAsyncIteration: - del generators[gen_index] - del tasks[gen_index] + del generators[cur_index] + del tasks[cur_index] if not is_stream: yield output_list diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 3ec5cd633f..241fabf6d1 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -591,7 +591,7 @@ async def async_generate( def generate( self, - prompt: str, + prompt: Union[str, List[str]], sampling_params: Optional[Dict] = None, return_logprob: Optional[Union[List[bool], bool]] = False, logprob_start_len: Optional[Union[List[int], int]] = None, @@ -612,7 +612,7 @@ def generate( def encode( self, - prompt: str, + prompt: Union[str, List[str]], ): json_data = { "text": prompt, diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index 4fc1f0f259..9f18a91f73 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -28,10 +28,10 @@ DEFAULT_PROMPTS = [ # the output of gemma-2-2b from SRT is unstable on the commented prompt # "The capital of France is", + "Apple is red. Banana is Yellow. " * 800 + "Apple is", "The capital of the United Kindom is", "Today is a sunny day and I like", "AI is a field of computer science focused on", - "Apple is red. Banana is Yellow. 
" * 800 + "Apple is", ] dirpath = os.path.dirname(__file__) diff --git a/test/srt/models/test_embedding_models.py b/test/srt/models/test_embedding_models.py index 44fed2ad0b..cc830f6257 100644 --- a/test/srt/models/test_embedding_models.py +++ b/test/srt/models/test_embedding_models.py @@ -20,7 +20,7 @@ from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner from sglang.test.test_utils import get_similarities -MODELS = [("intfloat/e5-mistral-7b-instruct", 1)] +MODELS = [("intfloat/e5-mistral-7b-instruct", 1, 0.2)] TORCH_DTYPES = [torch.float16] @@ -32,6 +32,7 @@ def assert_close_prefill_logits( model_path, tp_size, torch_dtype, + long_context_tolerance, ) -> None: with HFRunner( model_path, torch_dtype=torch_dtype, is_generation_model=False @@ -52,20 +53,22 @@ def assert_close_prefill_logits( hf_logits = torch.Tensor(hf_outputs.embed_logits[i]) srt_logits = torch.Tensor(srt_outputs.embed_logits[i]) - similarities = torch.tensor(get_similarities(hf_logits, srt_logits)) - print("max similarity diff", torch.max(abs(similarities - 1))) + similarity = torch.tensor(get_similarities(hf_logits, srt_logits)) + print("similarity diff", abs(similarity - 1)) - if hf_logits.shape[0] <= 100: - tolerance = 1e-2 - assert torch.all( - abs(similarities - 1) < tolerance - ), "embeddings are not all close" + if len(prompts[i]) <= 1000: + tolerance = 1e-5 + else: + tolerance = long_context_tolerance + assert torch.all( + abs(similarity - 1) < tolerance + ), "embeddings are not all close" def test_prefill_logits(self): - for model, tp_size in MODELS: + for model, tp_size, long_context_tolerance in MODELS: for torch_dtype in TORCH_DTYPES: self.assert_close_prefill_logits( - DEFAULT_PROMPTS, model, tp_size, torch_dtype + DEFAULT_PROMPTS, model, tp_size, torch_dtype, long_context_tolerance ) From bc4c7a35457b0a1cb4e83b9f80a01f2cbee9f0e9 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 25 Aug 2024 10:27:02 -0700 Subject: [PATCH 076/118] Relax the assert in moe throughput test to fix the flaky CI (#1207) --- test/srt/test_moe_serving_throughput.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/srt/test_moe_serving_throughput.py b/test/srt/test_moe_serving_throughput.py index bbcd512276..3cdf724f31 100644 --- a/test/srt/test_moe_serving_throughput.py +++ b/test/srt/test_moe_serving_throughput.py @@ -73,7 +73,7 @@ def test_default(self): if os.getenv("SGLANG_IS_IN_CI", "false") == "true": # A100 (PCIE) performance - assert res["output_throughput"] > 930 + assert res["output_throughput"] > 910 def test_default_without_radix_cache(self): res = self.run_test( @@ -84,7 +84,7 @@ def test_default_without_radix_cache(self): if os.getenv("SGLANG_IS_IN_CI", "false") == "true": # A100 (PCIE) performance - assert res["output_throughput"] > 930 + assert res["output_throughput"] > 910 def test_default_without_chunked_prefill(self): res = self.run_test( From 66e7dcaf7008d2ffe892044a21513a6e06424d1a Mon Sep 17 00:00:00 2001 From: Kaichen Zhang - NTU Date: Mon, 26 Aug 2024 01:28:23 +0800 Subject: [PATCH 077/118] [Fix] Fixing the multi-images error for llava-onevision (#1205) --- .../http_llava_onevision_test.py | 46 +++++++++++++++++++ .../sglang/srt/managers/tokenizer_manager.py | 4 +- test/srt/test_vision_openai_server.py | 42 +++++++++++++++++ 3 files changed, 91 insertions(+), 1 deletion(-) diff --git a/examples/runtime/llava_onevision/http_llava_onevision_test.py b/examples/runtime/llava_onevision/http_llava_onevision_test.py index 40dc27ec20..41d60b12af 100644 --- 
a/examples/runtime/llava_onevision/http_llava_onevision_test.py +++ b/examples/runtime/llava_onevision/http_llava_onevision_test.py @@ -78,6 +78,51 @@ def image_stream_request_test(client): print("-" * 30) +def multi_image_stream_request_test(client): + print( + "----------------------Multi-Images Stream Request Test----------------------" + ) + stream_request = client.chat.completions.create( + model="default", + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" + }, + }, + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png" + }, + }, + { + "type": "text", + "text": "I have shown you two images. Please describe the two images to me.", + }, + ], + }, + ], + temperature=0.7, + max_tokens=1024, + stream=True, + ) + stream_response = "" + + for chunk in stream_request: + if chunk.choices[0].delta.content is not None: + content = chunk.choices[0].delta.content + stream_response += content + sys.stdout.write(content) + sys.stdout.flush() + + print("-" * 30) + + def video_stream_request_test(client, video_path): print("------------------------Video Stream Request Test----------------------") messages = prepare_video_messages(video_path) @@ -209,6 +254,7 @@ def main(): client = create_openai_client("http://127.0.0.1:30000/v1") image_stream_request_test(client) + multi_image_stream_request_test(client) video_stream_request_test(client, video_path) image_speed_test(client) video_speed_test(client, video_path) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 8f67005755..5cc060be1a 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -744,7 +744,9 @@ def get_pixel_values( image, tuple(int(x * 255) for x in processor.image_processor.image_mean), ) - pixel_values = processor.image_processor(image)["pixel_values"][0] + pixel_values = processor.image_processor(image.convert("RGB"))[ + "pixel_values" + ][0] elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio: pixel_values = process_anyres_image( image, processor.image_processor, image_grid_pinpoints diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py index 0a477a92ae..0f136fe6e5 100644 --- a/test/srt/test_vision_openai_server.py +++ b/test/srt/test_vision_openai_server.py @@ -74,6 +74,48 @@ def test_chat_completion(self): assert response.usage.completion_tokens > 0 assert response.usage.total_tokens > 0 + def test_mult_images_chat_completion(self): + client = openai.Client(api_key=self.api_key, base_url=self.base_url) + + response = client.chat.completions.create( + model="default", + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" + }, + }, + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png" + }, + }, + { + "type": "text", + "text": "I have shown you two images. 
Please describe the two images to me.", + }, + ], + }, + ], + temperature=0, + ) + + assert response.choices[0].message.role == "assistant" + text = response.choices[0].message.content + assert isinstance(text, str) + assert "man" in text or "cab" in text, text + assert "logo" in text, text + assert response.id + assert response.created + assert response.usage.prompt_tokens > 0 + assert response.usage.completion_tokens > 0 + assert response.usage.total_tokens > 0 + def prepare_video_messages(self, video_path): max_frames_num = 32 vr = VideoReader(video_path, ctx=cpu(0)) From 30b4f771b0c515c18179f3e1ee0b4662b2606a95 Mon Sep 17 00:00:00 2001 From: Chayenne Date: Mon, 26 Aug 2024 01:29:12 +0800 Subject: [PATCH 078/118] Support Alibaba-NLP/gte-Qwen2-7B-instruct embedding Model (#1186) Co-authored-by: Ying Sheng --- .github/workflows/accuracy-test.yml | 2 +- .github/workflows/unit-test.yml | 2 +- README.md | 15 ++++ .../sglang/srt/managers/tokenizer_manager.py | 5 +- python/sglang/srt/managers/tp_worker.py | 1 + .../sglang/srt/model_executor/model_runner.py | 17 ++++- python/sglang/srt/models/llama_embedding.py | 4 + python/sglang/srt/models/qwen2.py | 12 ++- python/sglang/srt/server.py | 3 + python/sglang/srt/server_args.py | 11 +++ python/sglang/srt/utils.py | 9 ++- python/sglang/test/runners.py | 32 ++++---- test/srt/models/test_embedding_models.py | 28 ++++--- test/srt/models/test_generation_models.py | 73 +++++++++++++++++-- test/srt/run_suite.py | 8 +- 15 files changed, 167 insertions(+), 55 deletions(-) diff --git a/.github/workflows/accuracy-test.yml b/.github/workflows/accuracy-test.yml index 374f0d2856..16bb584f4a 100644 --- a/.github/workflows/accuracy-test.yml +++ b/.github/workflows/accuracy-test.yml @@ -43,4 +43,4 @@ jobs: run: | cd test/srt python3 test_eval_accuracy_large.py - timeout-minutes: 10 + timeout-minutes: 20 diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 3422cde40d..607cb865db 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -41,7 +41,7 @@ jobs: run: | cd test/srt python3 run_suite.py --suite minimal - timeout-minutes: 18 + timeout-minutes: 20 - name: Test Frontend Language run: | diff --git a/README.md b/README.md index 2fc91e7858..651108f9e2 100644 --- a/README.md +++ b/README.md @@ -187,6 +187,13 @@ response = client.chat.completions.create( max_tokens=64, ) print(response) + +# Text embedding +response = client.embeddings.create( + model="default", + input="How are you today", +) +print(response) ``` It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/). @@ -223,6 +230,8 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct ### Supported Models +**Generative Models** + - Llama / Llama 2 / Llama 3 / Llama 3.1 - Mistral / Mixtral / Mistral NeMo - Gemma / Gemma 2 @@ -243,6 +252,12 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct - ChatGLM - InternLM 2 +**Embedding Models** + +- e5-mistral +- gte-Qwen2 + - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding` + Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md). 
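The embedding models listed above are served through the OpenAI-compatible `/v1/embeddings` route, so their vectors can be compared directly on the client side. A minimal sketch, assuming a server started with `--is-embedding` as shown, listening on port 30000, and returning the standard OpenAI embedding schema:

```python
import numpy as np
import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")


def embed(text: str) -> np.ndarray:
    # One string per request; the first (and only) item holds the embedding vector.
    result = client.embeddings.create(model="default", input=text)
    return np.array(result.data[0].embedding)


a = embed("How are you today")
b = embed("How is it going today")
cosine = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
print(f"cosine similarity: {cosine:.4f}")
```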
#### Use Models From ModelScope diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 5cc060be1a..4008a093ad 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -94,7 +94,10 @@ def __init__( trust_remote_code=server_args.trust_remote_code, model_overide_args=model_overide_args, ) - self.is_generation = is_generation_model(self.hf_config.architectures) + + self.is_generation = is_generation_model( + self.hf_config.architectures, self.server_args.is_embedding + ) if server_args.context_length is not None: self.context_len = server_args.context_length diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index fa79f84921..19edc23b83 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -94,6 +94,7 @@ def __init__( context_length=server_args.context_length, model_overide_args=model_overide_args, ) + self.model_runner = ModelRunner( model_config=self.model_config, mem_fraction_static=server_args.mem_fraction_static, diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 661660281f..6b48d1f90e 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -204,7 +204,7 @@ def load_model(self): else None ) self.is_generation = is_generation_model( - self.model_config.hf_config.architectures + self.model_config.hf_config.architectures, self.server_args.is_embedding ) logger.info( @@ -522,9 +522,18 @@ def forward_extend(self, batch: ScheduleBatch): batch, forward_mode=ForwardMode.EXTEND, ) - return self.model.forward( - batch.input_ids, input_metadata.positions, input_metadata - ) + if self.is_generation: + return self.model.forward( + batch.input_ids, input_metadata.positions, input_metadata + ) + else: + # Only embedding models have get_embedding parameter + return self.model.forward( + batch.input_ids, + input_metadata.positions, + input_metadata, + get_embedding=True, + ) @torch.inference_mode() def forward_extend_multi_modal(self, batch: ScheduleBatch): diff --git a/python/sglang/srt/models/llama_embedding.py b/python/sglang/srt/models/llama_embedding.py index e8e6780472..dfff53cbcd 100644 --- a/python/sglang/srt/models/llama_embedding.py +++ b/python/sglang/srt/models/llama_embedding.py @@ -29,7 +29,11 @@ def forward( positions: torch.Tensor, input_metadata: InputMetadata, input_embeds: torch.Tensor = None, + get_embedding: bool = True, ) -> EmbeddingPoolerOutput: + assert ( + get_embedding + ), "LlamaEmbeddingModel / MistralModel is only used for embedding" hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) return self.pooler(hidden_states, input_metadata) diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py index d1295bd8cc..fcf083e1b5 100644 --- a/python/sglang/srt/models/qwen2.py +++ b/python/sglang/srt/models/qwen2.py @@ -38,6 +38,7 @@ from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor +from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -275,6 +276,7 @@ def __init__( self.model = Qwen2Model(config, quant_config=quant_config) 
self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) + self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True) @torch.no_grad() def forward( @@ -283,11 +285,15 @@ def forward( positions: torch.Tensor, input_metadata: InputMetadata, input_embeds: torch.Tensor = None, + get_embedding: bool = False, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( - input_ids, hidden_states, self.lm_head.weight, input_metadata - ) + if not get_embedding: + return self.logits_processor( + input_ids, hidden_states, self.lm_head.weight, input_metadata + ) + else: + return self.pooler(hidden_states, input_metadata) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 241fabf6d1..813f2de782 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -333,11 +333,13 @@ def launch_server( start_process = start_controller_process_single else: start_process = start_controller_process_multi + proc_controller = mp.Process( target=start_process, args=(server_args, port_args, pipe_controller_writer, model_overide_args), ) proc_controller.start() + proc_detoken = mp.Process( target=start_detokenizer_process, args=( @@ -515,6 +517,7 @@ def __init__( self.pid = None pipe_reader, pipe_writer = mp.Pipe(duplex=False) + proc = mp.Process( target=launch_server, args=(self.server_args, model_overide_args, pipe_writer), diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 870169c6d5..58e24dab8b 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -38,6 +38,7 @@ class ServerArgs: quantization: Optional[str] = None served_model_name: Optional[str] = None chat_template: Optional[str] = None + is_embedding: bool = False # Port host: str = "127.0.0.1" @@ -200,6 +201,11 @@ def add_cli_args(parser: argparse.ArgumentParser): action="store_true", help="Whether or not to allow for custom models defined on the Hub in their own modeling files.", ) + parser.add_argument( + "--is-embedding", + action="store_true", + help="Whether to use a CausalLM as an embedding model.", + ) parser.add_argument( "--context-length", type=int, @@ -458,6 +464,11 @@ def check_server_args(self): assert not ( self.dp_size > 1 and self.node_rank is not None ), "multi-node data parallel is not supported" + if "Alibaba-NLP/gte-Qwen2-1.5B-instruct" == self.model_path: + logger.info( + "Not sure why, the tokenizer will add an additional token at the end of the prompt when trust_remote_mode=True" + ) + self.trust_remote_code = False if "gemma-2" in self.model_path.lower(): logger.info("When using sliding window in gemma-2, turn on flashinfer.") self.disable_flashinfer = False diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 93c54782a0..102dcb3d87 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -224,13 +224,18 @@ def is_multimodal_model(model): raise ValueError("unrecognized type") -def is_generation_model(model_architectures): +def is_generation_model(model_architectures, is_embedding: bool = False): + # We have two ways to determine whether a model is a generative model. + # 1. Check the model architectue + # 2. 
check the `is_embedding` server args + if ( "LlamaEmbeddingModel" in model_architectures or "MistralModel" in model_architectures ): return False - return True + else: + return not is_embedding def decode_video_base64(video_base64): diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index 9f18a91f73..9a5bd4fd59 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -14,7 +14,7 @@ """ import json -import multiprocessing +import multiprocessing as mp import os from dataclasses import dataclass from typing import List, Union @@ -63,37 +63,35 @@ def __init__( self, model_path, torch_dtype, - is_generation_model, + is_generation, ): - self.in_queue = multiprocessing.Queue() - self.out_queue = multiprocessing.Queue() + self.is_generation = is_generation - self.model_proc = multiprocessing.Process( + self.in_queue = mp.Queue() + self.out_queue = mp.Queue() + + self.model_proc = mp.Process( target=self.start_model_process, args=( self.in_queue, self.out_queue, model_path, torch_dtype, - is_generation_model, ), ) self.model_proc.start() - def start_model_process( - self, in_queue, out_queue, model_path, torch_dtype, is_generation_model - ): + def start_model_process(self, in_queue, out_queue, model_path, torch_dtype): self.tokenizer = AutoTokenizer.from_pretrained( model_path, torch_dtype=torch_dtype, ) - self.is_generation_model = is_generation_model - - if self.is_generation_model: + if self.is_generation: self.model = AutoModelForCausalLM.from_pretrained( model_path, torch_dtype=torch_dtype, + trust_remote_code=False, low_cpu_mem_usage=True, ).cuda() else: @@ -107,7 +105,7 @@ def start_model_process( while True: prompts, max_new_tokens = in_queue.get() if prompts is not None: - if self.is_generation_model: + if self.is_generation: output_strs = [] prefill_logprobs = [] for p in prompts: @@ -171,17 +169,19 @@ def __init__( self, model_path, torch_dtype, - is_generation_model, + is_generation, tp_size=1, port=5157, ): - self.is_generation_model = is_generation_model + self.is_generation = is_generation self.runtime = Runtime( model_path=model_path, tp_size=tp_size, dtype=get_dtype_str(torch_dtype), port=port, mem_fraction_static=0.7, + trust_remote_code=False, + is_embedding=not self.is_generation, ) def forward( @@ -189,7 +189,7 @@ def forward( prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS, max_new_tokens=8, ): - if self.is_generation_model: + if self.is_generation: # the return value contains logprobs from prefill output_strs = [] top_input_logprobs = [] diff --git a/test/srt/models/test_embedding_models.py b/test/srt/models/test_embedding_models.py index cc830f6257..ecb3e7576e 100644 --- a/test/srt/models/test_embedding_models.py +++ b/test/srt/models/test_embedding_models.py @@ -20,7 +20,10 @@ from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner from sglang.test.test_utils import get_similarities -MODELS = [("intfloat/e5-mistral-7b-instruct", 1, 0.2)] +MODELS = [ + ("Alibaba-NLP/gte-Qwen2-1.5B-instruct", 1, 1e-5), + ("intfloat/e5-mistral-7b-instruct", 1, 1e-5), +] TORCH_DTYPES = [torch.float16] @@ -32,10 +35,10 @@ def assert_close_prefill_logits( model_path, tp_size, torch_dtype, - long_context_tolerance, + prefill_tolerance, ) -> None: with HFRunner( - model_path, torch_dtype=torch_dtype, is_generation_model=False + model_path, torch_dtype=torch_dtype, is_generation=False ) as hf_runner: hf_outputs = hf_runner.forward(prompts) @@ -43,11 +46,9 @@ def assert_close_prefill_logits( model_path, tp_size=tp_size, 
torch_dtype=torch_dtype, - is_generation_model=False, + is_generation=False, ) as srt_runner: - srt_outputs = srt_runner.forward( - prompts, - ) + srt_outputs = srt_runner.forward(prompts) for i in range(len(prompts)): hf_logits = torch.Tensor(hf_outputs.embed_logits[i]) @@ -57,18 +58,15 @@ def assert_close_prefill_logits( print("similarity diff", abs(similarity - 1)) if len(prompts[i]) <= 1000: - tolerance = 1e-5 - else: - tolerance = long_context_tolerance - assert torch.all( - abs(similarity - 1) < tolerance - ), "embeddings are not all close" + assert torch.all( + abs(similarity - 1) < prefill_tolerance + ), "embeddings are not all close" def test_prefill_logits(self): - for model, tp_size, long_context_tolerance in MODELS: + for model, tp_size, prefill_tolerance in MODELS: for torch_dtype in TORCH_DTYPES: self.assert_close_prefill_logits( - DEFAULT_PROMPTS, model, tp_size, torch_dtype, long_context_tolerance + DEFAULT_PROMPTS, model, tp_size, torch_dtype, prefill_tolerance ) diff --git a/test/srt/models/test_generation_models.py b/test/srt/models/test_generation_models.py index ba64907eae..7e7e401d27 100644 --- a/test/srt/models/test_generation_models.py +++ b/test/srt/models/test_generation_models.py @@ -20,12 +20,46 @@ from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner MODELS = [ - ("meta-llama/Meta-Llama-3.1-8B-Instruct", 1, 1.1), - ("google/gemma-2-2b", 1, 3), + ("meta-llama/Meta-Llama-3.1-8B-Instruct", 1, 1.1, 3e-2, 1), + ("google/gemma-2-2b", 1, 3, 3e-2, 1), + ("Alibaba-NLP/gte-Qwen2-1.5B-instruct", 1, None, 6e-2, 1), ] TORCH_DTYPES = [torch.float16] +def lcs(X, Y): + m = len(X) + n = len(Y) + L = [[0] * (n + 1) for _ in range(m + 1)] + + for i in range(m + 1): + for j in range(n + 1): + if i == 0 or j == 0: + L[i][j] = 0 + elif X[i - 1] == Y[j - 1]: + L[i][j] = L[i - 1][j - 1] + 1 + else: + L[i][j] = max(L[i - 1][j], L[i][j - 1]) + + return L[m][n] + + +def calculate_rouge_l(output_strs_list1, output_strs_list2): + rouge_l_scores = [] + + for s1, s2 in zip(output_strs_list1, output_strs_list2): + lcs_len = lcs(s1, s2) + precision = lcs_len / len(s1) if len(s1) > 0 else 0 + recall = lcs_len / len(s2) if len(s2) > 0 else 0 + if precision + recall > 0: + fmeasure = (2 * precision * recall) / (precision + recall) + else: + fmeasure = 0.0 + rouge_l_scores.append(fmeasure) + + return rouge_l_scores + + class TestGenerationModels(unittest.TestCase): def assert_close_prefill_logits_and_output_strs( @@ -35,10 +69,14 @@ def assert_close_prefill_logits_and_output_strs( tp_size, torch_dtype, max_new_tokens, + prefill_tolerance, + rouge_threshold, long_context_tolerance, ) -> None: + if model_path == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": + prompts = prompts[:-1] with HFRunner( - model_path, torch_dtype=torch_dtype, is_generation_model=True + model_path, torch_dtype=torch_dtype, is_generation=True ) as hf_runner: hf_outputs = hf_runner.forward(prompts, max_new_tokens=max_new_tokens) @@ -46,7 +84,7 @@ def assert_close_prefill_logits_and_output_strs( model_path, tp_size=tp_size, torch_dtype=torch_dtype, - is_generation_model=True, + is_generation=True, ) as srt_runner: srt_outputs = srt_runner.forward(prompts, max_new_tokens=max_new_tokens) @@ -56,17 +94,34 @@ def assert_close_prefill_logits_and_output_strs( print("max_diff", torch.max(abs(hf_logprobs - srt_logprobs))) if hf_logprobs.shape[0] <= 100: - tolerance = 3e-2 assert torch.all( - abs(hf_logprobs - srt_logprobs) < tolerance + abs(hf_logprobs - srt_logprobs) < prefill_tolerance ), "prefill logprobs are not all close" 
print(hf_outputs.output_strs) print(srt_outputs.output_strs) - assert hf_outputs.output_strs == srt_outputs.output_strs + rouge_l_scores = calculate_rouge_l( + hf_outputs.output_strs, srt_outputs.output_strs + ) + assert all( + score >= rouge_threshold for score in rouge_l_scores + ), f"Not all ROUGE-L scores are greater than {rouge_threshold}" def test_prefill_logits_and_output_strs(self): - for model, tp_size, long_context_tolerance in MODELS: + import multiprocessing as mp + + try: + mp.set_start_method("spawn") + except RuntimeError: + pass + + for ( + model, + tp_size, + long_context_tolerance, + prefill_tolerance, + rouge_threshold, + ) in MODELS: for torch_dtype in TORCH_DTYPES: max_new_tokens = 8 self.assert_close_prefill_logits_and_output_strs( @@ -75,6 +130,8 @@ def test_prefill_logits_and_output_strs(self): tp_size, torch_dtype, max_new_tokens, + prefill_tolerance=prefill_tolerance, + rouge_threshold=rouge_threshold, long_context_tolerance=long_context_tolerance, ) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 8a887912a0..5a11c8ee0f 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -5,6 +5,9 @@ suites = { "minimal": [ + "models/test_embedding_models.py", + "models/test_generation_models.py", + "sampling/penaltylib", "test_chunked_prefill.py", "test_embedding_openai_server.py", "test_eval_accuracy_mini.py", @@ -13,11 +16,8 @@ "test_skip_tokenizer_init.py", "test_torch_compile.py", "test_triton_attn_backend.py", - "test_vision_openai_server.py", "test_update_weights.py", - "models/test_generation_models.py", - "models/test_embedding_models.py", - "sampling/penaltylib", + "test_vision_openai_server.py", ], "sampling/penaltylib": glob.glob( "sampling/penaltylib/**/test_*.py", recursive=True From 902278008a6e5cf0f054c0b6ce4ba0cc64ce7437 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 25 Aug 2024 14:46:34 -0700 Subject: [PATCH 079/118] [Minor] Improve the function organization in TokenizerManager & improve loggers (#1208) --- docs/en/hyperparameter_tuning.md | 2 +- python/sglang/srt/hf_transformers_utils.py | 11 -- .../sglang/srt/managers/controller_multi.py | 7 +- .../sglang/srt/managers/controller_single.py | 14 +- .../srt/managers/detokenizer_manager.py | 20 ++- .../sglang/srt/managers/tokenizer_manager.py | 156 +++++++++--------- python/sglang/srt/managers/tp_worker.py | 14 +- .../sglang/srt/model_executor/model_runner.py | 17 +- python/sglang/srt/openai_api/adapter.py | 4 +- python/sglang/srt/server.py | 8 +- python/sglang/srt/server_args.py | 2 +- python/sglang/srt/utils.py | 16 +- 12 files changed, 137 insertions(+), 134 deletions(-) diff --git a/docs/en/hyperparameter_tuning.md b/docs/en/hyperparameter_tuning.md index 02a0657c3f..f2bf9d55f3 100644 --- a/docs/en/hyperparameter_tuning.md +++ b/docs/en/hyperparameter_tuning.md @@ -6,7 +6,7 @@ Achieving a large batch size is the most important thing for attaining high thro When the server is running at full load, look for the following in the log: -```[gpu=0] Decode batch. #running-req: 233, #token: 370959, token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417``` +```Decode batch. #running-req: 233, #token: 370959, token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417``` ### Tune Your Request Submission Speed `#queue-req` indicates the number of requests in the queue. If you frequently see `#queue-req == 0`, it suggests you are bottlenecked by the request submission speed. 
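To keep `#queue-req` above zero, requests have to be submitted concurrently rather than one after another. A rough sketch of a concurrent submitter against the native `/generate` endpoint, assuming the server runs on the default port 30000:

```python
import asyncio

import aiohttp


async def submit(session: aiohttp.ClientSession, prompt: str):
    # One generation request; the server batches these internally.
    async with session.post(
        "http://127.0.0.1:30000/generate",
        json={"text": prompt, "sampling_params": {"max_new_tokens": 64}},
    ) as resp:
        return await resp.json()


async def main():
    prompts = [f"Write one fun fact about the number {i}." for i in range(256)]
    async with aiohttp.ClientSession() as session:
        # Fire everything up front so the scheduler always has queued work.
        results = await asyncio.gather(*(submit(session, p) for p in prompts))
    print(f"{len(results)} responses received")


if __name__ == "__main__":
    asyncio.run(main())
```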
diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py index 525d295439..4f6e3d0715 100644 --- a/python/sglang/srt/hf_transformers_utils.py +++ b/python/sglang/srt/hf_transformers_utils.py @@ -142,17 +142,6 @@ def get_tokenizer( raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.") kwargs["use_fast"] = False - if ( - "llama" in tokenizer_name.lower() - and kwargs.get("use_fast", True) - and tokenizer_name != _FAST_LLAMA_TOKENIZER - ): - warnings.warn( - "For some LLaMA V1 models, initializing the fast tokenizer may " - "take a long time. To reduce the initialization time, consider " - f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original " - "tokenizer." - ) try: tokenizer = AutoTokenizer.from_pretrained( tokenizer_name, diff --git a/python/sglang/srt/managers/controller_multi.py b/python/sglang/srt/managers/controller_multi.py index 38229cd466..d2b10e7fa2 100644 --- a/python/sglang/srt/managers/controller_multi.py +++ b/python/sglang/srt/managers/controller_multi.py @@ -35,7 +35,7 @@ TokenizedGenerateReqInput, ) from sglang.srt.server_args import PortArgs, ServerArgs -from sglang.srt.utils import kill_parent_process +from sglang.srt.utils import configure_logger, kill_parent_process from sglang.utils import get_exception_traceback logger = logging.getLogger(__name__) @@ -193,10 +193,7 @@ def start_controller_process( ): """Start a controller process.""" - logging.basicConfig( - level=getattr(logging, server_args.log_level.upper()), - format="%(message)s", - ) + configure_logger(server_args) try: controller = ControllerMulti(server_args, port_args, model_overide_args) diff --git a/python/sglang/srt/managers/controller_single.py b/python/sglang/srt/managers/controller_single.py index 422db943f6..4a16a6f6e4 100644 --- a/python/sglang/srt/managers/controller_single.py +++ b/python/sglang/srt/managers/controller_single.py @@ -27,7 +27,7 @@ launch_tp_servers, ) from sglang.srt.server_args import PortArgs, ServerArgs -from sglang.srt.utils import kill_parent_process +from sglang.srt.utils import configure_logger, kill_parent_process from sglang.utils import get_exception_traceback logger = logging.getLogger(__name__) @@ -52,7 +52,7 @@ def __init__( self.dp_worker_id = dp_worker_id self.mp_queue = mp_queue - # Init communication + # Init inter-process communication context = zmq.Context(2) if not self.is_dp_worker: @@ -133,11 +133,11 @@ def start_controller_process( queue: multiprocessing.connection.Connection = None, ): """Start a controller process.""" - - logging.basicConfig( - level=getattr(logging, server_args.log_level.upper()), - format="%(message)s", - ) + if is_data_parallel_worker: + logger_prefix = f" DP{dp_worker_id} TP0" + else: + logger_prefix = " TP0" + configure_logger(server_args, prefix=logger_prefix) if not is_data_parallel_worker: tp_size_local = server_args.tp_size // server_args.nnodes diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py index 9a4306372b..cd5f63125c 100644 --- a/python/sglang/srt/managers/detokenizer_manager.py +++ b/python/sglang/srt/managers/detokenizer_manager.py @@ -56,6 +56,7 @@ def __init__( server_args: ServerArgs, port_args: PortArgs, ): + # Init inter-process communication context = zmq.asyncio.Context(2) self.recv_from_router = context.socket(zmq.PULL) self.recv_from_router.bind(f"tcp://127.0.0.1:{port_args.detokenizer_port}") @@ -75,10 +76,13 @@ def __init__( self.decode_status = {} async def handle_loop(self): + 
"""The event loop that handles requests""" + while True: - recv_obj: BatchTokenIDOut = await self.recv_from_router.recv_pyobj() + recv_obj = await self.recv_from_router.recv_pyobj() if isinstance(recv_obj, BatchEmbeddingOut): + # If it is embedding model, no detokenization is needed. self.send_to_tokenizer.send_pyobj( BatchEmbeddingOut( rids=recv_obj.rids, @@ -88,19 +92,18 @@ async def handle_loop(self): ) ) continue - - if isinstance(recv_obj, UpdateWeightReqOutput): + elif isinstance(recv_obj, UpdateWeightReqOutput): + # If it is a weight update request, no detokenization is needed. + self.send_to_tokenizer.send_pyobj(recv_obj) + continue + elif self.tokenizer is None: + # If the tokenizer is skipped, no detokenization is needed self.send_to_tokenizer.send_pyobj(recv_obj) continue assert isinstance(recv_obj, BatchTokenIDOut) bs = len(recv_obj.rids) - if self.tokenizer is None: - # Send BatchTokenIDOut if no tokenizer init'ed. - self.send_to_tokenizer.send_pyobj(recv_obj) - continue - # Initialize decode status read_ids, surr_ids = [], [] for i in range(bs): @@ -134,6 +137,7 @@ async def handle_loop(self): spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0], ) + # Incremental decoding output_strs = [] for i in range(bs): s = self.decode_status[recv_obj.rids[i]] diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 4008a093ad..199ea7c3a6 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -21,7 +21,7 @@ import logging import multiprocessing as mp import os -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np import transformers @@ -80,6 +80,7 @@ def __init__( ): self.server_args = server_args + # Init inter-process communication context = zmq.asyncio.Context(2) self.recv_from_detokenizer = context.socket(zmq.PULL) self.recv_from_detokenizer.bind(f"tcp://127.0.0.1:{port_args.tokenizer_port}") @@ -87,6 +88,7 @@ def __init__( self.send_to_router = context.socket(zmq.PUSH) self.send_to_router.connect(f"tcp://127.0.0.1:{port_args.controller_port}") + # Read model args self.model_path = server_args.model_path self.served_model_name = server_args.served_model_name self.hf_config = get_config( @@ -104,6 +106,7 @@ def __init__( else: self.context_len = get_context_length(self.hf_config) + # Create tokenizer if server_args.skip_tokenizer_init: self.tokenizer = self.processor = None else: @@ -127,6 +130,7 @@ def __init__( trust_remote_code=server_args.trust_remote_code, ) + # Store states self.to_create_loop = True self.rid_to_state: Dict[str, ReqState] = {} @@ -134,63 +138,6 @@ def __init__( self.model_update_lock = asyncio.Lock() self.model_update_result = None - async def get_pixel_values(self, image_data, aspect_ratio=None): - aspect_ratio = ( - getattr(self.hf_config, "image_aspect_ratio", None) - if aspect_ratio is None - else aspect_ratio - ) - grid_pinpoints = ( - self.hf_config.image_grid_pinpoints - if hasattr(self.hf_config, "image_grid_pinpoints") - and "anyres" in aspect_ratio - else None - ) - - if isinstance(image_data, list) and len(image_data) > 0: - pixel_values, image_hash, image_size = [], [], [] - if len(image_data) > 1: - aspect_ratio = "pad" # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. 
We do not use anyres - for img_data in image_data: - pixel_v, image_h, image_s = await self._process_single_image( - img_data, aspect_ratio, grid_pinpoints - ) - pixel_values.append(pixel_v) - image_hash.append(image_h) - image_size.append(image_s) - pixel_values = np.stack(pixel_values, axis=0) - else: - pixel_values, image_hash, image_size = await self._process_single_image( - image_data[0], aspect_ratio, grid_pinpoints - ) - image_hash = [image_hash] - image_size = [image_size] - elif isinstance(image_data, str): - pixel_values, image_hash, image_size = await self._process_single_image( - image_data, aspect_ratio, grid_pinpoints - ) - image_hash = [image_hash] - image_size = [image_size] - else: - pixel_values, image_hash, image_size = None, None, None - - return pixel_values, image_hash, image_size - - async def _process_single_image(self, image_data, aspect_ratio, grid_pinpoints): - if self.executor is not None: - loop = asyncio.get_event_loop() - return await loop.run_in_executor( - self.executor, - get_pixel_values, - image_data, - aspect_ratio, - grid_pinpoints, - ) - else: - return get_pixel_values( - image_data, aspect_ratio, grid_pinpoints, self.processor - ) - async def generate_request( self, obj: Union[GenerateReqInput, EmbeddingReqInput], request=None ): @@ -198,7 +145,7 @@ async def generate_request( self.create_handle_loop() while self.model_update_lock.locked(): - await asyncio.sleep(0) + await asyncio.sleep(0.001) obj.post_init() is_single = obj.is_single @@ -214,8 +161,8 @@ async def _handle_single_request( self, obj: Union[GenerateReqInput, EmbeddingReqInput], request, - index=None, - is_cache_for_prefill=False, + index: Optional[int] = None, + is_cache_for_prefill: Optional[bool] = False, ): if not is_cache_for_prefill: # The normal case with a single prompt not_use_index = index is None @@ -235,7 +182,7 @@ async def _handle_single_request( ) if self.is_generation: - pixel_values, image_hash, image_size = await self.get_pixel_values( + pixel_values, image_hash, image_size = await self._get_pixel_values( obj.image_data ) return_logprob = ( @@ -345,7 +292,7 @@ async def _handle_batch_request( parallel_sample_num = obj.parallel_sample_num if parallel_sample_num != 1: - # Send prefill requests to cache the common input + # Send prefill requests to cache the common prefix parallel_sample_num += 1 input_id_result = [] if obj.input_ids is None else None for i in range(batch_size): @@ -436,7 +383,6 @@ async def _handle_batch_request( ) # Then process the responses based on streaming option - is_stream = hasattr(obj, "stream") and obj.stream tasks = [asyncio.create_task(gen.__anext__()) for gen in generators] @@ -482,9 +428,9 @@ def _get_sampling_params(self, sampling_params_data: dict): async def _get_pixel_values(self, image_data): if isinstance(image_data, list) and len(image_data) > 0: - return await self.get_pixel_values(image_data[0]) + return await self._get_pixel_values_internal(image_data[0]) elif isinstance(image_data, str): - return await self.get_pixel_values(image_data) + return await self._get_pixel_values_internal(image_data) else: return None, None, None @@ -563,6 +509,13 @@ def flush_cache(self): req = FlushCacheReq() self.send_to_router.send_pyobj(req) + def abort_request(self, rid: str): + if rid not in self.rid_to_state: + return + del self.rid_to_state[rid] + req = AbortReq(rid) + self.send_to_router.send_pyobj(req) + async def update_weights(self, obj: UpdateWeightReqInput, request): if self.to_create_loop: self.create_handle_loop() @@ -587,13 +540,6 @@ 
async def update_weights(self, obj: UpdateWeightReqInput, request): else: return False, "Another update is in progress. Please try again later." - def abort_request(self, rid: str): - if rid not in self.rid_to_state: - return - del self.rid_to_state[rid] - req = AbortReq(rid) - self.send_to_router.send_pyobj(req) - def create_abort_task(self, obj: GenerateReqInput): # Abort the request if the client is disconnected. async def abort_request(): @@ -617,6 +563,8 @@ def create_handle_loop(self): loop.create_task(self.handle_loop()) async def handle_loop(self): + """The event loop that handles requests""" + while True: recv_obj: Union[ BatchStrOut, BatchEmbeddingOut, BatchTokenIDOut, UpdateWeightReqOutput @@ -713,11 +661,69 @@ def detokenize_top_logprobs_tokens(self, top_logprobs, decode_to_text: bool): ) return top_logprobs + async def _get_pixel_values_internal(self, image_data, aspect_ratio=None): + aspect_ratio = ( + getattr(self.hf_config, "image_aspect_ratio", None) + if aspect_ratio is None + else aspect_ratio + ) + grid_pinpoints = ( + self.hf_config.image_grid_pinpoints + if hasattr(self.hf_config, "image_grid_pinpoints") + and "anyres" in aspect_ratio + else None + ) + + if isinstance(image_data, list) and len(image_data) > 0: + pixel_values, image_hash, image_size = [], [], [] + if len(image_data) > 1: + aspect_ratio = "pad" # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres + for img_data in image_data: + pixel_v, image_h, image_s = await self._process_single_image( + img_data, aspect_ratio, grid_pinpoints + ) + pixel_values.append(pixel_v) + image_hash.append(image_h) + image_size.append(image_s) + pixel_values = np.stack(pixel_values, axis=0) + else: + pixel_values, image_hash, image_size = await self._process_single_image( + image_data[0], aspect_ratio, grid_pinpoints + ) + image_hash = [image_hash] + image_size = [image_size] + elif isinstance(image_data, str): + pixel_values, image_hash, image_size = await self._process_single_image( + image_data, aspect_ratio, grid_pinpoints + ) + image_hash = [image_hash] + image_size = [image_size] + else: + pixel_values, image_hash, image_size = None, None, None + + return pixel_values, image_hash, image_size + + async def _process_single_image(self, image_data, aspect_ratio, grid_pinpoints): + if self.executor is not None: + loop = asyncio.get_event_loop() + return await loop.run_in_executor( + self.executor, + _process_single_image_task, + image_data, + aspect_ratio, + grid_pinpoints, + ) + else: + return _process_single_image_task( + image_data, aspect_ratio, grid_pinpoints, self.processor + ) + global global_processor def init_global_processor(server_args: ServerArgs): + """Init the global processor for multi modal models.""" global global_processor transformers.logging.set_verbosity_error() global_processor = get_processor( @@ -727,7 +733,7 @@ def init_global_processor(server_args: ServerArgs): ) -def get_pixel_values( +def _process_single_image_task( image_data, image_aspect_ratio=None, image_grid_pinpoints=None, processor=None ): try: @@ -759,4 +765,4 @@ def get_pixel_values( pixel_values = pixel_values.astype(np.float16) return pixel_values, image_hash, image.size except Exception: - print("Exception in TokenizerManager:\n" + get_exception_traceback()) + logger.error("Exception in TokenizerManager:\n" + get_exception_traceback()) diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index 19edc23b83..9820e0302e 100644 --- 
a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -56,6 +56,7 @@ from sglang.srt.model_executor.model_runner import ModelRunner from sglang.srt.server_args import ServerArgs from sglang.srt.utils import ( + configure_logger, is_multimodal_model, set_random_seed, suppress_other_loggers, @@ -145,7 +146,6 @@ def __init__( # Print info logger.info( - f"[gpu={self.gpu_id}] " f"max_total_num_tokens={self.max_total_num_tokens}, " f"max_prefill_tokens={self.max_prefill_tokens}, " f"max_running_requests={self.max_running_requests}, " @@ -284,7 +284,7 @@ def print_decode_stats(self): self.num_generated_tokens = 0 self.last_stats_tic = time.time() logger.info( - f"[gpu={self.gpu_id}] Decode batch. " + f"Decode batch. " f"#running-req: {len(self.running_batch.reqs)}, " f"#token: {num_used}, " f"token usage: {num_used / self.max_total_num_tokens:.2f}, " @@ -443,7 +443,7 @@ def get_new_prefill_batch(self) -> Optional[ScheduleBatch]: if num_mixed_running > 0: logger.info( - f"[gpu={self.gpu_id}] Prefill batch" + f"Prefill batch" f"(mixed #running-req: {num_mixed_running}). " f"#new-seq: {len(can_run_list)}, " f"#new-token: {adder.log_input_tokens}, " @@ -453,7 +453,7 @@ def get_new_prefill_batch(self) -> Optional[ScheduleBatch]: ) else: logger.info( - f"[gpu={self.gpu_id}] Prefill batch. " + f"Prefill batch. " f"#new-seq: {len(can_run_list)}, " f"#new-token: {adder.log_input_tokens}, " f"#cached-token: {adder.log_hit_tokens}, " @@ -631,7 +631,7 @@ def forward_decode_batch(self, batch: ScheduleBatch): self.new_token_ratio = new_token_ratio logger.info( - "decode out of memory happened, " + "Decode out of memory happened. " f"#retracted_reqs: {len(retracted_reqs)}, " f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}" ) @@ -848,7 +848,9 @@ def run_tp_server( nccl_port: int, model_overide_args: dict, ): - """Run a tensor parallel server.""" + """Run a tensor parallel model server.""" + configure_logger(server_args, prefix=f" TP{tp_rank}") + try: model_server = ModelTpServer( gpu_id, diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 6b48d1f90e..fa55abba67 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -109,7 +109,7 @@ def __init__( def init_torch_distributed(self): # Init torch distributed torch.cuda.set_device(self.gpu_id) - logger.info(f"[gpu={self.gpu_id}] Init nccl begin.") + logger.info("Init nccl begin.") if not self.server_args.enable_p2p_check: monkey_patch_vllm_p2p_access_check(self.gpu_id) @@ -152,8 +152,7 @@ def init_torch_distributed(self): def load_model(self): logger.info( - f"[gpu={self.gpu_id}] Load weight begin. " - f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB" + f"Load weight begin. avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB" ) if torch.cuda.get_device_capability()[0] < 8: logger.info( @@ -208,7 +207,7 @@ def load_model(self): ) logger.info( - f"[gpu={self.gpu_id}] Load weight end. " + f"Load weight end. " f"type={type(self.model).__name__}, " f"dtype={self.dtype}, " f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB" @@ -224,7 +223,7 @@ def update_weights(self, model_path: str, load_format: str): from vllm.model_executor.model_loader.utils import set_default_torch_dtype logger.info( - f"[gpu={self.gpu_id}] Update weights begin. " + f"Update weights begin. 
" f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB" ) @@ -298,7 +297,7 @@ def model_load_weights(model, iter): self.load_config = load_config self.model_config.path = model_path - logger.info(f"[gpu={self.gpu_id}] Update weights end.") + logger.info("Update weights end.") return True, "Succeeded to update model weights" def profile_max_num_token(self, total_gpu_memory: int): @@ -387,7 +386,7 @@ def init_memory_pool( layer_num=self.model_config.num_hidden_layers, ) logger.info( - f"[gpu={self.gpu_id}] Memory pool end. " + f"Memory pool end. " f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB" ) @@ -473,9 +472,7 @@ def init_cuda_graphs(self): self.cuda_graph_runner = None return - logger.info( - f"[gpu={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes." - ) + logger.info("Capture cuda graph begin. This can take up to several minutes.") if self.server_args.disable_cuda_graph_padding: batch_size_list = list(range(1, 32)) + [64, 128] diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 582457ae04..f325e84b2f 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -123,7 +123,7 @@ def create_streaming_error_response( def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg): global chat_template_name - print(f"Use chat template: {chat_template_arg}") + logger.info(f"Use chat template: {chat_template_arg}") if not chat_template_exists(chat_template_arg): if not os.path.exists(chat_template_arg): raise RuntimeError( @@ -355,7 +355,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe } except Exception as e: - print("error in SGLang:", e) + logger.error("error in SGLang:", e) # Update batch status to "failed" retrieve_batch = batch_storage[batch_id] retrieve_batch.status = "failed" diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 813f2de782..021f231aa7 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -74,6 +74,7 @@ add_api_key_middleware, allocate_init_ports, assert_pkg_version, + configure_logger, enable_show_time_cost, kill_child_process, maybe_set_triton_cache_manager, @@ -270,15 +271,12 @@ def launch_server( """Launch an HTTP server.""" global tokenizer_manager - logging.basicConfig( - level=getattr(logging, server_args.log_level.upper()), - format="%(message)s", - ) + configure_logger(server_args) server_args.check_server_args() _set_envs_and_config(server_args) - # Allocate ports + # Allocate ports for inter-process communications server_args.port, server_args.additional_ports = allocate_init_ports( server_args.port, server_args.additional_ports, diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 58e24dab8b..ca27f97482 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -418,7 +418,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--enable-mixed-chunk", action="store_true", - help="Enabling mixing prefill and decode in a chunked batch.", + help="Enabling mixing prefill and decode in a batch when using chunked prefill.", ) parser.add_argument( "--enable-torch-compile", diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 102dcb3d87..a6e710009f 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -692,7 +692,7 @@ def weight_loader_srt( setattr(QKVParallelLinear, "weight_loader", weight_loader_srt) -def 
add_api_key_middleware(app, api_key): +def add_api_key_middleware(app, api_key: str): @app.middleware("http") async def authentication(request, call_next): if request.method == "OPTIONS": @@ -704,7 +704,7 @@ async def authentication(request, call_next): return await call_next(request) -def prepare_model(model_path): +def prepare_model(model_path: str): if "SGLANG_USE_MODELSCOPE" in os.environ: if not os.path.exists(model_path): from modelscope import snapshot_download @@ -713,7 +713,7 @@ def prepare_model(model_path): return model_path -def prepare_tokenizer(tokenizer_path): +def prepare_tokenizer(tokenizer_path: str): if "SGLANG_USE_MODELSCOPE" in os.environ: if not os.path.exists(tokenizer_path): from modelscope import snapshot_download @@ -722,3 +722,13 @@ def prepare_tokenizer(tokenizer_path): tokenizer_path, ignore_patterns=["*.bin", "*.safetensors"] ) return tokenizer_path + + +def configure_logger(server_args, prefix: str = ""): + format = f"[%(asctime)s{prefix}] %(message)s" + logging.basicConfig( + level=getattr(logging, server_args.log_level.upper()), + format=format, + datefmt="%H:%M:%S", + force=True, + ) From ab4990e4bfd79fe60815a3f872c5857df57798bb Mon Sep 17 00:00:00 2001 From: Ying Sheng Date: Sun, 25 Aug 2024 14:49:23 -0700 Subject: [PATCH 080/118] [Minor] Temporarily skip flaky test (#1209) --- test/srt/run_suite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 5a11c8ee0f..e8edbb5500 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -6,7 +6,7 @@ suites = { "minimal": [ "models/test_embedding_models.py", - "models/test_generation_models.py", + # "models/test_generation_models.py", "sampling/penaltylib", "test_chunked_prefill.py", "test_embedding_openai_server.py", From 308d024092d8a671998b978f419dd40262bef9b5 Mon Sep 17 00:00:00 2001 From: Ying Sheng Date: Sun, 25 Aug 2024 16:21:37 -0700 Subject: [PATCH 081/118] [CI] Fix the issue of unit test hanging (#1211) --- python/sglang/test/test_utils.py | 17 +++++++++-------- test/srt/models/test_embedding_models.py | 6 ++++++ test/srt/models/test_generation_models.py | 13 ++++++------- test/srt/run_suite.py | 6 ++++++ 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 9f6aa68ab1..ac19d93703 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -460,24 +460,25 @@ def _target_func(): return ret_value[0] +def run_one_file(filename, out_queue): + print(f"\n\nRun {filename}\n\n") + ret = unittest.main(module=None, argv=["", "-vb"] + [filename]) + + def run_unittest_files(files: List[str], timeout_per_file: float): tic = time.time() success = True for filename in files: + out_queue = multiprocessing.Queue() + p = multiprocessing.Process(target=run_one_file, args=(filename, out_queue)) - def func(): - print(f"\n\nRun {filename}\n\n") - ret = unittest.main(module=None, argv=["", "-vb"] + [filename]) - - p = multiprocessing.Process(target=func) - - def run_one_file(): + def run_process(): p.start() p.join() try: - run_with_timeout(run_one_file, timeout=timeout_per_file) + run_with_timeout(run_process, timeout=timeout_per_file) if p.exitcode != 0: success = False break diff --git a/test/srt/models/test_embedding_models.py b/test/srt/models/test_embedding_models.py index ecb3e7576e..8a43255b7f 100644 --- a/test/srt/models/test_embedding_models.py +++ b/test/srt/models/test_embedding_models.py @@ -13,6 +13,7 @@ limitations under the 
License. """ +import multiprocessing as mp import unittest import torch @@ -71,4 +72,9 @@ def test_prefill_logits(self): if __name__ == "__main__": + try: + mp.set_start_method("spawn") + except RuntimeError: + pass + unittest.main(warnings="ignore") diff --git a/test/srt/models/test_generation_models.py b/test/srt/models/test_generation_models.py index 7e7e401d27..4e49c0a5b1 100644 --- a/test/srt/models/test_generation_models.py +++ b/test/srt/models/test_generation_models.py @@ -13,6 +13,7 @@ limitations under the License. """ +import multiprocessing as mp import unittest import torch @@ -108,13 +109,6 @@ def assert_close_prefill_logits_and_output_strs( ), f"Not all ROUGE-L scores are greater than {rouge_threshold}" def test_prefill_logits_and_output_strs(self): - import multiprocessing as mp - - try: - mp.set_start_method("spawn") - except RuntimeError: - pass - for ( model, tp_size, @@ -137,4 +131,9 @@ def test_prefill_logits_and_output_strs(self): if __name__ == "__main__": + try: + mp.set_start_method("spawn") + except RuntimeError: + pass + unittest.main(warnings="ignore") diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index e8edbb5500..3756d3ddfb 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -1,5 +1,6 @@ import argparse import glob +import multiprocessing as mp from sglang.test.test_utils import run_unittest_files @@ -54,5 +55,10 @@ else: files = suites[args.suite] + try: + mp.set_start_method("spawn") + except RuntimeError: + pass + exit_code = run_unittest_files(files, args.timeout_per_file) exit(exit_code) From 15f1a49d2dcbd488155de373e7fcf854f29a7de8 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 25 Aug 2024 16:43:07 -0700 Subject: [PATCH 082/118] Update CI workflows (#1210) --- .github/workflows/accuracy-test.yml | 7 +-- .github/workflows/e2e-test.yml | 11 ++-- .github/workflows/moe-test.yml | 50 +++++++++---------- .github/workflows/unit-test.yml | 9 ++-- python/sglang/test/runners.py | 3 +- python/sglang/test/test_utils.py | 1 + .../test_srt_endpoint_with_penalizers.py | 8 ++- test/srt/test_vision_openai_server.py | 2 +- 8 files changed, 43 insertions(+), 48 deletions(-) diff --git a/.github/workflows/accuracy-test.yml b/.github/workflows/accuracy-test.yml index 16bb584f4a..b60a9c6d48 100644 --- a/.github/workflows/accuracy-test.yml +++ b/.github/workflows/accuracy-test.yml @@ -20,7 +20,7 @@ concurrency: jobs: accuracy-test: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - runs-on: accuracy + runs-on: accuracy-test steps: - name: Checkout code @@ -28,9 +28,6 @@ jobs: - name: Install dependencies run: | - source $HOME/venv/bin/activate - echo "$HOME/venv/bin" >> $GITHUB_PATH - pip install --upgrade pip pip install -e "python[all]" pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall @@ -40,7 +37,7 @@ jobs: pip install -e . 
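As a stand-alone illustration of the `spawn` start-method guard that the test entry points above adopt, here is a minimal sketch using only the standard library; the worker function is a placeholder. `spawn` avoids forking a process that may already hold CUDA state, and `set_start_method()` raises `RuntimeError` if the method was already fixed, hence the try/except.

```python
# Minimal sketch of the start-method guard used by the test entry points above.
import multiprocessing as mp


def _worker(x: int) -> None:
    # Placeholder workload; real tests spawn model runtimes here.
    print(f"worker received {x}")


if __name__ == "__main__":
    try:
        mp.set_start_method("spawn")
    except RuntimeError:
        pass  # already set by a parent runner or an earlier import

    p = mp.Process(target=_worker, args=(42,))
    p.start()
    p.join()
```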
- name: Evaluate Accuracy + timeout-minutes: 20 run: | cd test/srt python3 test_eval_accuracy_large.py - timeout-minutes: 20 diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index ad271c37ed..8d33870411 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -20,7 +20,7 @@ concurrency: jobs: e2e-test: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - runs-on: e2e + runs-on: e2e-test steps: - name: Checkout code @@ -28,27 +28,24 @@ jobs: - name: Install dependencies run: | - source $HOME/venv/bin/activate - echo "$HOME/venv/bin" >> $GITHUB_PATH - pip install --upgrade pip pip install -e "python[all]" pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall - name: Benchmark Serving Throughput + timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_serving_throughput.TestServingThroughput.test_default - timeout-minutes: 10 - name: Benchmark Serving Throughput (w/o RadixAttention) + timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache - timeout-minutes: 10 - name: Benchmark Serving Throughput (w/o ChunkedPrefill) + timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill - timeout-minutes: 10 diff --git a/.github/workflows/moe-test.yml b/.github/workflows/moe-test.yml index 51f7d02261..dd5665a3f5 100644 --- a/.github/workflows/moe-test.yml +++ b/.github/workflows/moe-test.yml @@ -18,30 +18,28 @@ concurrency: cancel-in-progress: true jobs: - moe-test: - if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - runs-on: accuracy - - steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Install dependencies - run: | - source $HOME/venv/bin/activate - echo "$HOME/venv/bin" >> $GITHUB_PATH - - pip install --upgrade pip - pip install -e "python[all]" - pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall + moe-test: + if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' + runs-on: moe-test - - name: Benchmark MOE Serving Throughput - uses: nick-fields/retry@v3 - with: - timeout_minutes: 15 - max_attempts: 2 - retry_on: error - command: | - cd test/srt - python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default - python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install -e "python[all]" + pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall + + - name: Benchmark MoE Serving Throughput + timeout_minutes: 10 + run: | + cd test/srt + python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default + + - name: Benchmark MoE Serving Throughput (w/o RadixAttention) + timeout_minutes: 10 + run: | + cd test/srt + python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 607cb865db..e2d7951be4 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -20,7 +20,7 @@ concurrency: jobs: unit-test: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - 
runs-on: unit + runs-on: unit-test steps: - name: Checkout code @@ -28,9 +28,6 @@ jobs: - name: Install dependencies run: | - source $HOME/venv/bin/activate - echo "$HOME/venv/bin" >> $GITHUB_PATH - pip install --upgrade pip pip install -e "python[all]" pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall @@ -38,13 +35,13 @@ jobs: pip install sentence_transformers - name: Test Backend Runtime + timeout-minutes: 20 run: | cd test/srt python3 run_suite.py --suite minimal - timeout-minutes: 20 - name: Test Frontend Language + timeout-minutes: 10 run: | cd test/lang python3 run_suite.py --suite minimal - timeout-minutes: 10 diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index 9a5bd4fd59..37ed2cf9ad 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -24,6 +24,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from sglang.srt.server import Runtime +from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER DEFAULT_PROMPTS = [ # the output of gemma-2-2b from SRT is unstable on the commented prompt @@ -171,7 +172,7 @@ def __init__( torch_dtype, is_generation, tp_size=1, - port=5157, + port=DEFAULT_PORT_FOR_SRT_TEST_RUNNER, ): self.is_generation = is_generation self.runtime = Runtime( diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index ac19d93703..3389e619c9 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -25,6 +25,7 @@ DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1" if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157 DEFAULT_URL_FOR_MOE_TEST = "http://127.0.0.1:6157" DEFAULT_URL_FOR_ACCURACY_TEST = "http://127.0.0.1:7157" DEFAULT_URL_FOR_UNIT_TEST = "http://127.0.0.1:8157" diff --git a/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py b/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py index e72dc30f95..4e91f72355 100644 --- a/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py +++ b/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py @@ -5,7 +5,11 @@ import requests from sglang.srt.utils import kill_child_process -from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, + popen_launch_server, +) class TestBatchPenalizerE2E(unittest.TestCase): @@ -13,7 +17,7 @@ class TestBatchPenalizerE2E(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = f"http://127.0.0.1:{8157}" + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.process = popen_launch_server( cls.model, cls.base_url, diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py index 0f136fe6e5..48157b8db4 100644 --- a/test/srt/test_vision_openai_server.py +++ b/test/srt/test_vision_openai_server.py @@ -67,7 +67,7 @@ def test_chat_completion(self): assert response.choices[0].message.role == "assistant" text = response.choices[0].message.content assert isinstance(text, str) - assert "car" in text or "taxi" in text, text + assert "logo" in text, text assert response.id assert response.created assert response.usage.prompt_tokens > 0 From 61bb223e0fc1ccd0c26ac3137f0d9154bcecc25a Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 25 Aug 2024 17:31:52 -0700 Subject: [PATCH 083/118] Update CI runner docs (#1213) --- 
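For reference, the multimodal chat requests exercised by the vision-server test above (and extended in a later commit in this series) have the following client-side shape. This is a hedged sketch: the base URL, API key, and model name are placeholders rather than values taken from the CI configuration.

```python
# Illustrative client-side call mirroring the multi-image vision test.
import openai

client = openai.OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="sk-123456")

response = client.chat.completions.create(
    model="default",  # placeholder model name
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png"
                    },
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png"
                    },
                },
                {
                    "type": "text",
                    "text": "Describe the first image in one sentence, then the second image in another sentence.",
                },
            ],
        }
    ],
    temperature=0,
)
print(response.choices[0].message.content)
```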
.github/workflows/moe-test.yml | 4 +- docs/en/setup_github_runner.md | 101 +++++++++------------------------ 2 files changed, 30 insertions(+), 75 deletions(-) diff --git a/.github/workflows/moe-test.yml b/.github/workflows/moe-test.yml index dd5665a3f5..2caa3d1820 100644 --- a/.github/workflows/moe-test.yml +++ b/.github/workflows/moe-test.yml @@ -33,13 +33,13 @@ jobs: pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall - name: Benchmark MoE Serving Throughput - timeout_minutes: 10 + timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default - name: Benchmark MoE Serving Throughput (w/o RadixAttention) - timeout_minutes: 10 + timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache diff --git a/docs/en/setup_github_runner.md b/docs/en/setup_github_runner.md index 97a7f26266..282e12b736 100644 --- a/docs/en/setup_github_runner.md +++ b/docs/en/setup_github_runner.md @@ -1,89 +1,44 @@ -# Set up self hosted runner for GitHub Action +# Set Up Self-hosted Runners for GitHub Action -## Config Runner +## Add a Runner -```bash -# https://github.com/sgl-project/sglang/settings/actions/runners/new?arch=x64&os=linux -# Involves some TOKEN and other private information, click the link to view specific steps. -``` +### Step 1: Start a docker container. -## Start Runner +You can mount a folder for the shared huggingface model weights cache. The command below uses `/tmp/huggingface` as an example. -add `/lib/systemd/system/e2e.service` ``` -[Unit] -StartLimitIntervalSec=0 -[Service] -Environment="CUDA_VISIBLE_DEVICES=7" -Environment="XDG_CACHE_HOME=/data/.cache" -Environment="HF_TOKEN=hf_xx" -Environment="OPENAI_API_KEY=sk-xx" -Environment="HOME=/data/zhyncs/runner-v1" -Environment="SGLANG_IS_IN_CI=true" -Restart=always -RestartSec=1 -ExecStart=/data/zhyncs/runner-v1/actions-runner/run.sh -[Install] -WantedBy=multi-user.target +docker pull nvidia/cuda:12.1.1-devel-ubuntu22.04 +docker run --shm-size 64g -it -v /tmp/huggingface:/hf_home --gpus all nvidia/cuda:12.1.1-devel-ubuntu22.04 /bin/bash ``` -add `/lib/systemd/system/unit.service` -``` -[Unit] -StartLimitIntervalSec=0 -[Service] -Environment="CUDA_VISIBLE_DEVICES=6" -Environment="XDG_CACHE_HOME=/data/.cache" -Environment="HF_TOKEN=hf_xx" -Environment="OPENAI_API_KEY=sk-xx" -Environment="HOME=/data/zhyncs/runner-v2" -Environment="SGLANG_IS_IN_CI=true" -Restart=always -RestartSec=1 -ExecStart=/data/zhyncs/runner-v2/actions-runner/run.sh -[Install] -WantedBy=multi-user.target -``` +### Step 2: Configure the runner by `config.sh` + +Run these commands inside the container. 
-add `/lib/systemd/system/accuracy.service` ``` -[Unit] -StartLimitIntervalSec=0 -[Service] -Environment="CUDA_VISIBLE_DEVICES=5" -Environment="XDG_CACHE_HOME=/data/.cache" -Environment="HF_TOKEN=hf_xx" -Environment="OPENAI_API_KEY=sk-xx" -Environment="HOME=/data/zhyncs/runner-v3" -Environment="SGLANG_IS_IN_CI=true" -Restart=always -RestartSec=1 -ExecStart=/data/zhyncs/runner-v3/actions-runner/run.sh -[Install] -WantedBy=multi-user.target +apt update && apt install -y curl python3-pip git +export RUNNER_ALLOW_RUNASROOT=1 ``` -```bash -cd /data/zhyncs/runner-v1 -python3 -m venv venv +Then follow https://github.com/sgl-project/sglang/settings/actions/runners/new?arch=x64&os=linux to run `config.sh` -cd /data/zhyncs/runner-v2 -python3 -m venv venv +**Notes** +- Do not need to specify the runner group +- Give it a name (e.g., `test-sgl-gpu-0`) and some labels (e.g., `unit-test`). The labels can be editted later in Github Settings. +- Do not need to change the work folder. -cd /data/zhyncs/runner-v3 -python3 -m venv venv +### Step 3: Run the runner by `run.sh` -sudo systemctl daemon-reload - -sudo systemctl start e2e -sudo systemctl enable e2e -sudo systemctl status e2e - -sudo systemctl start unit -sudo systemctl enable unit -sudo systemctl status unit +- Set up environment variables +``` +export HF_HOME=/hf_home +export SGLANG_IS_IN_CI=true +export HF_TOKEN=hf_xxx +export OPENAI_API_KEY=sk-xxx +export CUDA_VISIBLE_DEVICES=0 +``` -sudo systemctl start accuracy -sudo systemctl enable accuracy -sudo systemctl status accuracy +- Run it forever ``` +while true; do ./run.sh; echo "Restarting..."; sleep 2; done +``` \ No newline at end of file From 2c615d120fa5da4ff6b88f59ca7656b8d595ccd0 Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Mon, 26 Aug 2024 08:38:11 +0800 Subject: [PATCH 084/118] [Feature] Support fp8 e5m2 kv cache with flashinfer (#1204) Co-authored-by: Yineng Zhang --- python/sglang/srt/layers/radix_attention.py | 7 +- python/sglang/srt/mem_cache/memory_pool.py | 90 +++++++++++++++++-- .../srt/model_executor/forward_batch_info.py | 4 + .../sglang/srt/model_executor/model_runner.py | 23 ++++- python/sglang/srt/server_args.py | 8 ++ 5 files changed, 116 insertions(+), 16 deletions(-) diff --git a/python/sglang/srt/layers/radix_attention.py b/python/sglang/srt/layers/radix_attention.py index a02673dc37..91735a1b81 100644 --- a/python/sglang/srt/layers/radix_attention.py +++ b/python/sglang/srt/layers/radix_attention.py @@ -203,7 +203,6 @@ def forward(self, q, k, v, input_metadata: InputMetadata): return self.decode_forward(q, k, v, input_metadata) def store_kv_cache(self, cache_k, cache_v, input_metadata: InputMetadata): - k_cache = input_metadata.token_to_kv_pool.get_key_buffer(self.layer_id) - v_cache = input_metadata.token_to_kv_pool.get_value_buffer(self.layer_id) - k_cache[input_metadata.out_cache_loc] = cache_k - v_cache[input_metadata.out_cache_loc] = cache_v + input_metadata.token_to_kv_pool.set_kv_buffer( + self.layer_id, input_metadata.out_cache_loc, cache_k, cache_v + ) diff --git a/python/sglang/srt/mem_cache/memory_pool.py b/python/sglang/srt/mem_cache/memory_pool.py index 68cefbbf9f..fef74321ac 100644 --- a/python/sglang/srt/mem_cache/memory_pool.py +++ b/python/sglang/srt/mem_cache/memory_pool.py @@ -16,7 +16,8 @@ """Memory pool.""" import logging -from typing import List, Union +from abc import ABC, abstractmethod +from typing import List, Tuple, Union import torch @@ -52,14 +53,21 @@ def clear(self): self.free_slots = list(range(self.size)) -class BaseTokenToKVPool: +class 
BaseTokenToKVPool(ABC): """A memory pool that maps a token to its kv cache locations""" def __init__( self, size: int, + dtype: torch.dtype, ): self.size = size + self.dtype = dtype + if dtype == torch.float8_e5m2: + # NOTE: Store as torch.uint8 because Tensor index_put is not implemented for torch.float8_e5m2 + self.store_dtype = torch.uint8 + else: + self.store_dtype = dtype # We also add one slot. This slot is used for writing dummy output from padded tokens. self.mem_state = torch.ones((self.size + 1,), dtype=torch.bool, device="cuda") @@ -112,6 +120,28 @@ def clear(self): # We also add one slot. This slot is used for writing dummy output from padded tokens. self.mem_state[0] = False + @abstractmethod + def get_key_buffer(self, layer_id: int) -> torch.Tensor: + raise NotImplementedError() + + @abstractmethod + def get_value_buffer(self, layer_id: int) -> torch.Tensor: + raise NotImplementedError() + + @abstractmethod + def get_kv_buffer(self, layer_id: int) -> Tuple[torch.Tensor, torch.Tensor]: + raise NotImplementedError() + + @abstractmethod + def set_kv_buffer( + self, + layer_id: int, + loc: torch.Tensor, + cache_k: torch.Tensor, + cache_v: torch.Tensor, + ) -> None: + raise NotImplementedError() + class MHATokenToKVPool(BaseTokenToKVPool): @@ -123,26 +153,52 @@ def __init__( head_dim: int, layer_num: int, ): - super().__init__(size) + super().__init__(size, dtype) # [size, head_num, head_dim] for each layer self.k_buffer = [ - torch.empty((size + 1, head_num, head_dim), dtype=dtype, device="cuda") + torch.empty( + (size + 1, head_num, head_dim), dtype=self.store_dtype, device="cuda" + ) for _ in range(layer_num) ] self.v_buffer = [ - torch.empty((size + 1, head_num, head_dim), dtype=dtype, device="cuda") + torch.empty( + (size + 1, head_num, head_dim), dtype=self.store_dtype, device="cuda" + ) for _ in range(layer_num) ] def get_key_buffer(self, layer_id: int): + if self.store_dtype != self.dtype: + return self.k_buffer[layer_id].view(self.dtype) return self.k_buffer[layer_id] def get_value_buffer(self, layer_id: int): + if self.store_dtype != self.dtype: + return self.v_buffer[layer_id].view(self.dtype) return self.v_buffer[layer_id] def get_kv_buffer(self, layer_id: int): - return self.k_buffer[layer_id], self.v_buffer[layer_id] + return self.get_key_buffer(layer_id), self.get_value_buffer(layer_id) + + def set_kv_buffer( + self, + layer_id: int, + loc: torch.Tensor, + cache_k: torch.Tensor, + cache_v: torch.Tensor, + ): + if cache_k.dtype != self.dtype: + cache_k = cache_k.to(self.dtype) + if cache_v.dtype != self.dtype: + cache_v = cache_v.to(self.dtype) + if self.store_dtype != self.dtype: + self.k_buffer[layer_id][loc] = cache_k.view(self.store_dtype) + self.v_buffer[layer_id][loc] = cache_v.view(self.store_dtype) + else: + self.k_buffer[layer_id][loc] = cache_k + self.v_buffer[layer_id][loc] = cache_v class MLATokenToKVPool(BaseTokenToKVPool): @@ -155,23 +211,41 @@ def __init__( qk_rope_head_dim: int, layer_num: int, ): - super().__init__(size) + super().__init__(size, dtype) self.kv_lora_rank = kv_lora_rank self.kv_buffer = [ torch.empty( (size + 1, 1, kv_lora_rank + qk_rope_head_dim), - dtype=dtype, + dtype=self.store_dtype, device="cuda", ) for _ in range(layer_num) ] def get_key_buffer(self, layer_id: int): + if self.store_dtype != self.dtype: + return self.kv_buffer[layer_id].view(self.dtype) return self.kv_buffer[layer_id] def get_value_buffer(self, layer_id: int): + if self.store_dtype != self.dtype: + return self.kv_buffer[layer_id][..., : 
self.kv_lora_rank].view(self.dtype) return self.kv_buffer[layer_id][..., : self.kv_lora_rank] def get_kv_buffer(self, layer_id: int): return self.get_key_buffer(layer_id), self.get_value_buffer(layer_id) + + def set_kv_buffer( + self, + layer_id: int, + loc: torch.Tensor, + cache_k: torch.Tensor, + cache_v: torch.Tensor, + ): + if cache_k.dtype != self.dtype: + cache_k = cache_k.to(self.dtype) + if self.store_dtype != self.dtype: + self.kv_buffer[layer_id][loc] = cache_k.view(self.store_dtype) + else: + self.kv_buffer[layer_id][loc] = cache_k diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py index 98daeaece4..c107b3bc82 100644 --- a/python/sglang/srt/model_executor/forward_batch_info.py +++ b/python/sglang/srt/model_executor/forward_batch_info.py @@ -315,6 +315,8 @@ def update_flashinfer_indices( num_kv_heads, head_dim, 1, + data_type=model_runner.kv_cache_dtype, + q_data_type=model_runner.dtype, ) else: # extend part @@ -393,6 +395,8 @@ def update_flashinfer_indices( num_kv_heads, head_dim, 1, + data_type=model_runner.kv_cache_dtype, + q_data_type=model_runner.dtype, ) else: # extend part diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index fa55abba67..fecfc2b430 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -311,7 +311,7 @@ def profile_max_num_token(self, total_gpu_memory: int): cell_size = ( (self.model_config.kv_lora_rank + self.model_config.qk_rope_head_dim) * self.model_config.num_hidden_layers - * torch._utils._element_size(self.dtype) + * torch._utils._element_size(self.kv_cache_dtype) ) else: cell_size = ( @@ -319,7 +319,7 @@ def profile_max_num_token(self, total_gpu_memory: int): * self.model_config.head_dim * self.model_config.num_hidden_layers * 2 - * torch._utils._element_size(self.dtype) + * torch._utils._element_size(self.kv_cache_dtype) ) rest_memory = available_gpu_memory - total_gpu_memory * ( 1 - self.mem_fraction_static @@ -333,6 +333,21 @@ def init_memory_pool( max_num_reqs: int = None, max_total_tokens: int = None, ): + if self.server_args.kv_cache_dtype == "auto": + self.kv_cache_dtype = self.dtype + elif self.server_args.kv_cache_dtype == "fp8_e5m2": + if self.server_args.disable_flashinfer or self.server_args.enable_mla: + logger.warning( + "FP8 KV cache is not supported for Triton kernel now, using auto kv cache dtype" + ) + self.kv_cache_dtype = self.dtype + else: + self.kv_cache_dtype = torch.float8_e5m2 + else: + raise ValueError( + f"Unsupported kv_cache_dtype: {self.server_args.kv_cache_dtype}." 
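Stepping briefly outside the diff: the storage trick behind the new `fp8_e5m2` cache option above (keep a `uint8` buffer and reinterpret on read/write, since indexed writes are not implemented for `float8_e5m2`) can be reproduced in isolation. A minimal sketch assuming PyTorch 2.1+; shapes and sizes are arbitrary.

```python
# Stand-alone illustration of the uint8-backed fp8 KV storage used by set_kv_buffer.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
kv_dtype = torch.float8_e5m2   # requested cache dtype
store_dtype = torch.uint8      # indexed writes work on uint8, not on float8_e5m2

size, head_num, head_dim = 8, 2, 4
k_buffer = torch.empty((size, head_num, head_dim), dtype=store_dtype, device=device)

cache_k = torch.randn((3, head_num, head_dim), dtype=torch.float16, device=device)
loc = torch.tensor([0, 3, 5], device=device)

# Write path: quantize to fp8, then reinterpret the bytes as uint8 for the indexed store.
k_buffer[loc] = cache_k.to(kv_dtype).view(store_dtype)

# Read path: index the raw buffer, then reinterpret the bytes as fp8 again.
k_fp8 = k_buffer[loc].view(kv_dtype)
print(k_fp8.to(torch.float16))
```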
+ ) + self.max_total_num_tokens = self.profile_max_num_token(total_gpu_memory) if max_total_tokens is not None: if max_total_tokens > self.max_total_num_tokens: @@ -369,7 +384,7 @@ def init_memory_pool( ): self.token_to_kv_pool = MLATokenToKVPool( self.max_total_num_tokens, - dtype=self.dtype, + dtype=self.kv_cache_dtype, kv_lora_rank=self.model_config.kv_lora_rank, qk_rope_head_dim=self.model_config.qk_rope_head_dim, layer_num=self.model_config.num_hidden_layers, @@ -380,7 +395,7 @@ def init_memory_pool( else: self.token_to_kv_pool = MHATokenToKVPool( self.max_total_num_tokens, - dtype=self.dtype, + dtype=self.kv_cache_dtype, head_num=self.model_config.get_num_kv_heads(self.tp_size), head_dim=self.model_config.head_dim, layer_num=self.model_config.num_hidden_layers, diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index ca27f97482..8a56c02e16 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -33,6 +33,7 @@ class ServerArgs: skip_tokenizer_init: bool = False load_format: str = "auto" dtype: str = "auto" + kv_cache_dtype: str = "auto" trust_remote_code: bool = True context_length: Optional[int] = None quantization: Optional[str] = None @@ -196,6 +197,13 @@ def add_cli_args(parser: argparse.ArgumentParser): '* "float" is shorthand for FP32 precision.\n' '* "float32" for FP32 precision.', ) + parser.add_argument( + "--kv-cache-dtype", + type=str, + default=ServerArgs.kv_cache_dtype, + choices=["auto", "fp8_e5m2"], + help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" is supported for CUDA 11.8+.', + ) parser.add_argument( "--trust-remote-code", action="store_true", From d3efcb3930cfb1c79958dda00ce3e044fd85b714 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 25 Aug 2024 17:45:35 -0700 Subject: [PATCH 085/118] Update workflow files (#1214) --- .github/workflows/accuracy-test.yml | 2 +- .github/workflows/e2e-test.yml | 2 +- .github/workflows/moe-test.yml | 2 +- .github/workflows/unit-test.yml | 2 +- docs/en/setup_github_runner.md | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/accuracy-test.yml b/.github/workflows/accuracy-test.yml index b60a9c6d48..6e1818c9c9 100644 --- a/.github/workflows/accuracy-test.yml +++ b/.github/workflows/accuracy-test.yml @@ -20,7 +20,7 @@ concurrency: jobs: accuracy-test: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - runs-on: accuracy-test + runs-on: 1-gpu-runner steps: - name: Checkout code diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 8d33870411..2db6801c76 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -20,7 +20,7 @@ concurrency: jobs: e2e-test: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - runs-on: e2e-test + runs-on: 1-gpu-runner steps: - name: Checkout code diff --git a/.github/workflows/moe-test.yml b/.github/workflows/moe-test.yml index 2caa3d1820..111f190c7c 100644 --- a/.github/workflows/moe-test.yml +++ b/.github/workflows/moe-test.yml @@ -20,7 +20,7 @@ concurrency: jobs: moe-test: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - runs-on: moe-test + runs-on: 2-gpu-runner steps: - name: Checkout code diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index e2d7951be4..752c05da75 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -20,7 +20,7 @@ concurrency: 
jobs: unit-test: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - runs-on: unit-test + runs-on: 1-gpu-runner steps: - name: Checkout code diff --git a/docs/en/setup_github_runner.md b/docs/en/setup_github_runner.md index 282e12b736..8e817dcc88 100644 --- a/docs/en/setup_github_runner.md +++ b/docs/en/setup_github_runner.md @@ -24,7 +24,7 @@ Then follow https://github.com/sgl-project/sglang/settings/actions/runners/new?a **Notes** - Do not need to specify the runner group -- Give it a name (e.g., `test-sgl-gpu-0`) and some labels (e.g., `unit-test`). The labels can be editted later in Github Settings. +- Give it a name (e.g., `test-sgl-gpu-0`) and some labels (e.g., `1-gpu-runner`). The labels can be editted later in Github Settings. - Do not need to change the work folder. ### Step 3: Run the runner by `run.sh` From 158e8f1e2d499e225add6ed0554896c94fd5a891 Mon Sep 17 00:00:00 2001 From: Mingyi Date: Sun, 25 Aug 2024 19:02:08 -0700 Subject: [PATCH 086/118] improve the threshold and ports in tests (#1215) --- python/sglang/test/test_utils.py | 12 +++----- .../test_srt_endpoint_with_penalizers.py | 7 +++-- test/srt/test_chunked_prefill.py | 7 +++-- test/srt/test_embedding_openai_server.py | 13 +++++++-- test/srt/test_eval_accuracy_large.py | 8 ++--- ...est_eval_accuracy_large_chunked_prefill.py | 7 +++-- ...al_accuracy_large_mixed_chunked_prefill.py | 7 +++-- test/srt/test_eval_accuracy_mini.py | 9 ++++-- test/srt/test_large_max_new_tokens.py | 7 +++-- test/srt/test_moe_serving_throughput.py | 29 +++++++------------ test/srt/test_openai_server.py | 10 +++++-- test/srt/test_serving_throughput.py | 22 ++++++++------ test/srt/test_skip_tokenizer_init.py | 10 +++++-- test/srt/test_srt_endpoint.py | 9 ++++-- test/srt/test_torch_compile.py | 10 +++++-- test/srt/test_triton_attn_backend.py | 10 +++++-- test/srt/test_update_weights.py | 9 ++++-- test/srt/test_vision_openai_server.py | 22 +++++++++----- 18 files changed, 122 insertions(+), 86 deletions(-) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 3389e619c9..373b7c1a57 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -23,18 +23,14 @@ DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct" DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1" +DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600 if os.getenv("SGLANG_IS_IN_CI", "false") == "true": DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157 - DEFAULT_URL_FOR_MOE_TEST = "http://127.0.0.1:6157" - DEFAULT_URL_FOR_ACCURACY_TEST = "http://127.0.0.1:7157" - DEFAULT_URL_FOR_UNIT_TEST = "http://127.0.0.1:8157" - DEFAULT_URL_FOR_E2E_TEST = "http://127.0.0.1:9157" + DEFAULT_URL_FOR_TEST = "http://127.0.0.1:6157" else: - DEFAULT_URL_FOR_MOE_TEST = "http://127.0.0.1:1157" - DEFAULT_URL_FOR_ACCURACY_TEST = "http://127.0.0.1:1257" - DEFAULT_URL_FOR_UNIT_TEST = "http://127.0.0.1:1357" - DEFAULT_URL_FOR_E2E_TEST = "http://127.0.0.1:1457" + DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 1157 + DEFAULT_URL_FOR_TEST = "http://127.0.0.1:2157" def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None): diff --git a/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py b/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py index 4e91f72355..2f5b352ae9 100644 --- a/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py +++ b/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py @@ -7,7 +7,8 @@ from sglang.srt.utils import 
kill_child_process from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_UNIT_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, popen_launch_server, ) @@ -17,11 +18,11 @@ class TestBatchPenalizerE2E(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_UNIT_TEST + cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, - timeout=300, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=( "--random-seed", "0", diff --git a/test/srt/test_chunked_prefill.py b/test/srt/test_chunked_prefill.py index 8d81dc0c3e..2eb704dc91 100644 --- a/test/srt/test_chunked_prefill.py +++ b/test/srt/test_chunked_prefill.py @@ -5,7 +5,8 @@ from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_UNIT_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, popen_launch_server, ) @@ -20,11 +21,11 @@ def run_mmlu(self, disable_radix_cache, enable_mixed_chunk): other_args += ["--enable-mixed-chunk"] model = DEFAULT_MODEL_NAME_FOR_TEST - base_url = DEFAULT_URL_FOR_UNIT_TEST + base_url = DEFAULT_URL_FOR_TEST process = popen_launch_server( model, base_url, - timeout=300, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=other_args, ) diff --git a/test/srt/test_embedding_openai_server.py b/test/srt/test_embedding_openai_server.py index fd8fec48e9..45f7850da9 100644 --- a/test/srt/test_embedding_openai_server.py +++ b/test/srt/test_embedding_openai_server.py @@ -4,17 +4,24 @@ from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.utils import kill_child_process -from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) class TestOpenAIServer(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = "intfloat/e5-mistral-7b-instruct" - cls.base_url = DEFAULT_URL_FOR_UNIT_TEST + cls.base_url = DEFAULT_URL_FOR_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( - cls.model, cls.base_url, timeout=300, api_key=cls.api_key + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + api_key=cls.api_key, ) cls.base_url += "/v1" cls.tokenizer = get_tokenizer(cls.model) diff --git a/test/srt/test_eval_accuracy_large.py b/test/srt/test_eval_accuracy_large.py index 470ed11aa4..3729ad26b6 100644 --- a/test/srt/test_eval_accuracy_large.py +++ b/test/srt/test_eval_accuracy_large.py @@ -5,8 +5,8 @@ from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_ACCURACY_TEST, - DEFAULT_URL_FOR_UNIT_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, popen_launch_server, ) @@ -15,11 +15,11 @@ class TestEvalAccuracyLarge(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_ACCURACY_TEST + cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, - timeout=300, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=["--log-level-http", "warning"], ) diff --git a/test/srt/test_eval_accuracy_large_chunked_prefill.py b/test/srt/test_eval_accuracy_large_chunked_prefill.py index 951f481da3..02df2a7f56 100644 --- a/test/srt/test_eval_accuracy_large_chunked_prefill.py +++ 
b/test/srt/test_eval_accuracy_large_chunked_prefill.py @@ -5,7 +5,8 @@ from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_ACCURACY_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, popen_launch_server, ) @@ -14,11 +15,11 @@ class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_ACCURACY_TEST + cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, - timeout=300, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=["--log-level-http", "warning", "--chunked-prefill-size", "256"], ) diff --git a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py b/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py index 210c32b519..8ba71e5c83 100644 --- a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py +++ b/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py @@ -5,7 +5,8 @@ from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_ACCURACY_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, popen_launch_server, ) @@ -14,11 +15,11 @@ class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_ACCURACY_TEST + cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, - timeout=300, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--log-level-http", "warning", diff --git a/test/srt/test_eval_accuracy_mini.py b/test/srt/test_eval_accuracy_mini.py index a4219b1a0a..25aa0ca116 100644 --- a/test/srt/test_eval_accuracy_mini.py +++ b/test/srt/test_eval_accuracy_mini.py @@ -5,7 +5,8 @@ from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_UNIT_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, popen_launch_server, ) @@ -14,8 +15,10 @@ class TestEvalAccuracyMini(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_UNIT_TEST - cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300) + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH + ) @classmethod def tearDownClass(cls): diff --git a/test/srt/test_large_max_new_tokens.py b/test/srt/test_large_max_new_tokens.py index f29adabced..10b82706a6 100644 --- a/test/srt/test_large_max_new_tokens.py +++ b/test/srt/test_large_max_new_tokens.py @@ -10,7 +10,8 @@ from sglang.srt.utils import kill_child_process from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_UNIT_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, popen_launch_server, ) @@ -19,12 +20,12 @@ class TestOpenAIServer(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_UNIT_TEST + cls.base_url = DEFAULT_URL_FOR_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( cls.model, cls.base_url, - timeout=300, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, api_key=cls.api_key, other_args=("--max-total-token", "1024"), env={"SGLANG_CLIP_MAX_NEW_TOKENS": "256", **os.environ}, diff --git 
a/test/srt/test_moe_serving_throughput.py b/test/srt/test_moe_serving_throughput.py index 3cdf724f31..4f6e8db82c 100644 --- a/test/srt/test_moe_serving_throughput.py +++ b/test/srt/test_moe_serving_throughput.py @@ -7,7 +7,8 @@ from sglang.srt.utils import kill_child_process from sglang.test.test_utils import ( DEFAULT_MOE_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_MOE_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, popen_launch_server, ) @@ -25,9 +26,12 @@ def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size other_args.append("--enable-p2p-check") model = DEFAULT_MOE_MODEL_NAME_FOR_TEST - base_url = DEFAULT_URL_FOR_MOE_TEST + base_url = DEFAULT_URL_FOR_TEST process = popen_launch_server( - model, base_url, timeout=300, other_args=other_args + model, + base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_args, ) # Run benchmark @@ -72,8 +76,8 @@ def test_default(self): ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": - # A100 (PCIE) performance - assert res["output_throughput"] > 910 + # A100 (PCIE): 950, H100 (SMX): 1800 + assert res["output_throughput"] > 1750 def test_default_without_radix_cache(self): res = self.run_test( @@ -83,19 +87,8 @@ def test_default_without_radix_cache(self): ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": - # A100 (PCIE) performance - assert res["output_throughput"] > 910 - - def test_default_without_chunked_prefill(self): - res = self.run_test( - disable_radix_cache=ServerArgs.disable_radix_cache, - disable_flashinfer=ServerArgs.disable_flashinfer, - chunked_prefill_size=-1, - ) - - if os.getenv("SGLANG_IS_IN_CI", "false") == "true": - # A100 (PCIE) performance - print(res["output_throughput"]) + # A100 (PCIE): 950, H100 (SMX): 1900 + assert res["output_throughput"] > 1850 def test_all_cases(self): for disable_radix_cache in [False, True]: diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index 828f5ab532..ce130956de 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -8,7 +8,8 @@ from sglang.srt.utils import kill_child_process from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_UNIT_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, popen_launch_server, ) @@ -17,10 +18,13 @@ class TestOpenAIServer(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_UNIT_TEST + cls.base_url = DEFAULT_URL_FOR_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( - cls.model, cls.base_url, timeout=300, api_key=cls.api_key + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + api_key=cls.api_key, ) cls.base_url += "/v1" cls.tokenizer = get_tokenizer(DEFAULT_MODEL_NAME_FOR_TEST) diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py index 261ac6ec52..f1089a6a7b 100644 --- a/test/srt/test_serving_throughput.py +++ b/test/srt/test_serving_throughput.py @@ -7,7 +7,8 @@ from sglang.srt.utils import kill_child_process from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_E2E_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, popen_launch_server, ) @@ -23,9 +24,12 @@ def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)]) model = DEFAULT_MODEL_NAME_FOR_TEST - base_url = DEFAULT_URL_FOR_E2E_TEST + base_url = DEFAULT_URL_FOR_TEST 
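The server-test fixture updated throughout this commit always follows the same shape; condensed here into one illustrative skeleton (the class name and test body are placeholders, the helpers are the ones imported above).

```python
# Condensed, illustrative version of the recurring server-test fixture.
import unittest

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class ExampleServerTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
        )

    @classmethod
    def tearDownClass(cls):
        # Tear down the launched server process tree.
        kill_child_process(cls.process.pid)

    def test_server_is_up(self):
        # Placeholder assertion; real tests run evals or benchmarks here.
        self.assertIsNotNone(self.process)


if __name__ == "__main__":
    unittest.main()
```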
process = popen_launch_server( - model, base_url, timeout=300, other_args=other_args + model, + base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_args, ) # Run benchmark @@ -70,8 +74,8 @@ def test_default(self): ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": - # A100 (PCIE) performance - assert res["output_throughput"] > 1400 + # A100 (PCIE): 1450, H100 (SMX): 2550 + assert res["output_throughput"] > 2500 def test_default_without_radix_cache(self): res = self.run_test( @@ -81,8 +85,8 @@ def test_default_without_radix_cache(self): ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": - # A100 (PCIE) performance - assert res["output_throughput"] > 1450 + # A100 (PCIE): 1500, H100 (SMX): 2850 + assert res["output_throughput"] > 2800 def test_default_without_chunked_prefill(self): res = self.run_test( @@ -92,8 +96,8 @@ def test_default_without_chunked_prefill(self): ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": - # A100 (PCIE) performance - assert res["output_throughput"] > 1400 + # A100 (PCIE): 1450, H100 (SMX): 2550 + assert res["output_throughput"] > 2500 def test_all_cases(self): for disable_radix_cache in [False, True]: diff --git a/test/srt/test_skip_tokenizer_init.py b/test/srt/test_skip_tokenizer_init.py index 7501056151..b159bb5578 100644 --- a/test/srt/test_skip_tokenizer_init.py +++ b/test/srt/test_skip_tokenizer_init.py @@ -6,7 +6,8 @@ from sglang.srt.utils import kill_child_process from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_UNIT_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, popen_launch_server, ) @@ -15,9 +16,12 @@ class TestSkipTokenizerInit(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_UNIT_TEST + cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( - cls.model, cls.base_url, timeout=300, other_args=["--skip-tokenizer-init"] + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=["--skip-tokenizer-init"], ) @classmethod diff --git a/test/srt/test_srt_endpoint.py b/test/srt/test_srt_endpoint.py index 60f4cd58a3..818aae2151 100644 --- a/test/srt/test_srt_endpoint.py +++ b/test/srt/test_srt_endpoint.py @@ -6,7 +6,8 @@ from sglang.srt.utils import kill_child_process from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_UNIT_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, popen_launch_server, ) @@ -15,8 +16,10 @@ class TestSRTEndpoint(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_UNIT_TEST - cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300) + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH + ) @classmethod def tearDownClass(cls): diff --git a/test/srt/test_torch_compile.py b/test/srt/test_torch_compile.py index 5133d3cd3c..26daf4fa57 100644 --- a/test/srt/test_torch_compile.py +++ b/test/srt/test_torch_compile.py @@ -5,7 +5,8 @@ from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_UNIT_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, popen_launch_server, ) @@ -14,9 +15,12 @@ class TestTorchCompile(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = 
DEFAULT_URL_FOR_UNIT_TEST + cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( - cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"] + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=["--enable-torch-compile"], ) @classmethod diff --git a/test/srt/test_triton_attn_backend.py b/test/srt/test_triton_attn_backend.py index 7a453d8be7..a94ca92124 100644 --- a/test/srt/test_triton_attn_backend.py +++ b/test/srt/test_triton_attn_backend.py @@ -5,7 +5,8 @@ from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_UNIT_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, popen_launch_server, ) @@ -14,9 +15,12 @@ class TestTritonAttnBackend(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_UNIT_TEST + cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( - cls.model, cls.base_url, timeout=300, other_args=["--disable-flashinfer"] + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=["--disable-flashinfer"], ) @classmethod diff --git a/test/srt/test_update_weights.py b/test/srt/test_update_weights.py index 64f84263aa..7b8404c735 100644 --- a/test/srt/test_update_weights.py +++ b/test/srt/test_update_weights.py @@ -6,7 +6,8 @@ from sglang.srt.utils import kill_child_process from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_URL_FOR_UNIT_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, popen_launch_server, ) @@ -15,8 +16,10 @@ class TestReplaceWeights(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_UNIT_TEST - cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300) + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH + ) @classmethod def tearDownClass(cls): diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py index 48157b8db4..a345717760 100644 --- a/test/srt/test_vision_openai_server.py +++ b/test/srt/test_vision_openai_server.py @@ -11,19 +11,23 @@ from PIL import Image from sglang.srt.utils import kill_child_process -from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) class TestOpenAIVisionServer(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = "lmms-lab/llava-onevision-qwen2-0.5b-ov" - cls.base_url = DEFAULT_URL_FOR_UNIT_TEST + cls.base_url = DEFAULT_URL_FOR_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( cls.model, cls.base_url, - timeout=300, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, api_key=cls.api_key, other_args=[ "--chat-template", @@ -67,7 +71,7 @@ def test_chat_completion(self): assert response.choices[0].message.role == "assistant" text = response.choices[0].message.content assert isinstance(text, str) - assert "logo" in text, text + assert "man" in text or "cab" in text, text assert response.id assert response.created assert response.usage.prompt_tokens > 0 @@ -86,18 +90,19 @@ def test_mult_images_chat_completion(self): { "type": "image_url", "image_url": { - "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" + "url": 
"https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png" }, }, { "type": "image_url", "image_url": { - "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png" + "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" }, }, { "type": "text", - "text": "I have shown you two images. Please describe the two images to me.", + "text": "I have two very different images. They are not related at all. " + "Please describe the first image in one sentence, and then describe the second image in another sentence.", }, ], }, @@ -108,8 +113,9 @@ def test_mult_images_chat_completion(self): assert response.choices[0].message.role == "assistant" text = response.choices[0].message.content assert isinstance(text, str) + print(text) assert "man" in text or "cab" in text, text - assert "logo" in text, text + # assert "logo" in text, text assert response.id assert response.created assert response.usage.prompt_tokens > 0 From 7514b9f8d3660417c085538076cf5162f32ce2fb Mon Sep 17 00:00:00 2001 From: Mingyi Date: Sun, 25 Aug 2024 19:56:42 -0700 Subject: [PATCH 087/118] [CI] Fix CI (#1217) --- python/sglang/test/test_utils.py | 35 +++++++++---------- test/srt/models/test_embedding_models.py | 2 +- test/srt/models/test_generation_models.py | 2 +- test/srt/run_suite.py | 8 ++--- .../test_srt_endpoint_with_penalizers.py | 2 +- 5 files changed, 22 insertions(+), 27 deletions(-) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 373b7c1a57..0468863996 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -2,12 +2,10 @@ import argparse import asyncio -import multiprocessing import os import subprocess import threading import time -import unittest from functools import partial from typing import Callable, List, Optional @@ -19,6 +17,7 @@ from sglang.global_config import global_config from sglang.lang.backend.openai import OpenAI from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint +from sglang.srt.utils import kill_child_process from sglang.utils import get_exception_traceback DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct" @@ -457,35 +456,35 @@ def _target_func(): return ret_value[0] -def run_one_file(filename, out_queue): - print(f"\n\nRun {filename}\n\n") - ret = unittest.main(module=None, argv=["", "-vb"] + [filename]) - - def run_unittest_files(files: List[str], timeout_per_file: float): tic = time.time() success = True for filename in files: - out_queue = multiprocessing.Queue() - p = multiprocessing.Process(target=run_one_file, args=(filename, out_queue)) + global process - def run_process(): - p.start() - p.join() + def run_one_file(filename): + filename = os.path.join(os.getcwd(), filename) + print(f"\n\nRun {filename}\n\n") + process = subprocess.Popen( + ["python3", filename], stdout=None, stderr=None, env=os.environ + ) + process.wait() + return process.returncode try: - run_with_timeout(run_process, timeout=timeout_per_file) - if p.exitcode != 0: - success = False - break + ret_code = run_with_timeout( + run_one_file, args=(filename,), timeout=timeout_per_file + ) + assert ret_code == 0 except TimeoutError: - p.terminate() + kill_child_process(process.pid) time.sleep(5) print( f"\nTimeout after {timeout_per_file} seconds when running {filename}\n" ) - return False + success = False + break if success: print(f"Success. 
Time elapsed: {time.time() - tic:.2f}s") diff --git a/test/srt/models/test_embedding_models.py b/test/srt/models/test_embedding_models.py index 8a43255b7f..a5a73bf319 100644 --- a/test/srt/models/test_embedding_models.py +++ b/test/srt/models/test_embedding_models.py @@ -77,4 +77,4 @@ def test_prefill_logits(self): except RuntimeError: pass - unittest.main(warnings="ignore") + unittest.main() diff --git a/test/srt/models/test_generation_models.py b/test/srt/models/test_generation_models.py index 4e49c0a5b1..b953ccf5d6 100644 --- a/test/srt/models/test_generation_models.py +++ b/test/srt/models/test_generation_models.py @@ -136,4 +136,4 @@ def test_prefill_logits_and_output_strs(self): except RuntimeError: pass - unittest.main(warnings="ignore") + unittest.main() diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 3756d3ddfb..4e11d8da25 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -7,7 +7,7 @@ suites = { "minimal": [ "models/test_embedding_models.py", - # "models/test_generation_models.py", + "models/test_generation_models.py", "sampling/penaltylib", "test_chunked_prefill.py", "test_embedding_openai_server.py", @@ -33,6 +33,7 @@ tests.remove(target_suite_name) tests.extend(target_tests) + if __name__ == "__main__": arg_parser = argparse.ArgumentParser() arg_parser.add_argument( @@ -55,10 +56,5 @@ else: files = suites[args.suite] - try: - mp.set_start_method("spawn") - except RuntimeError: - pass - exit_code = run_unittest_files(files, args.timeout_per_file) exit(exit_code) diff --git a/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py b/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py index 2f5b352ae9..e3496102cb 100644 --- a/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py +++ b/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py @@ -112,4 +112,4 @@ def test_repetition_penalty(self): if __name__ == "__main__": - unittest.main(warnings="ignore") + unittest.main() From 3579162ab102351b8cac5d17eab29e05fee63abe Mon Sep 17 00:00:00 2001 From: Kaichen Zhang - NTU Date: Mon, 26 Aug 2024 11:58:51 +0800 Subject: [PATCH 088/118] [Fix] Multi-images loading error (#1218) --- python/sglang/srt/managers/tokenizer_manager.py | 2 +- test/srt/test_vision_openai_server.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 199ea7c3a6..3f25ad5607 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -428,7 +428,7 @@ def _get_sampling_params(self, sampling_params_data: dict): async def _get_pixel_values(self, image_data): if isinstance(image_data, list) and len(image_data) > 0: - return await self._get_pixel_values_internal(image_data[0]) + return await self._get_pixel_values_internal(image_data) elif isinstance(image_data, str): return await self._get_pixel_values_internal(image_data) else: diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py index a345717760..0003e4776a 100644 --- a/test/srt/test_vision_openai_server.py +++ b/test/srt/test_vision_openai_server.py @@ -114,8 +114,8 @@ def test_mult_images_chat_completion(self): text = response.choices[0].message.content assert isinstance(text, str) print(text) - assert "man" in text or "cab" in text, text - # assert "logo" in text, text + assert "man" in text and "taxi" in text, text + assert "logo" in text, text assert response.id assert 
response.created assert response.usage.prompt_tokens > 0 From 632d506d0b526f641f9ced4f408dad8bd64b5009 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Sun, 25 Aug 2024 21:26:31 -0700 Subject: [PATCH 089/118] minor: improve CI and dependencies (#1212) --- .github/workflows/unit-test.yml | 4 +--- python/pyproject.toml | 2 +- python/sglang/srt/managers/tokenizer_manager.py | 8 +++----- python/sglang/test/test_utils.py | 9 +++++---- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 752c05da75..6ab3716033 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -29,10 +29,8 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip - pip install -e "python[all]" + pip install -e "python[dev]" pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall - pip install accelerate - pip install sentence_transformers - name: Test Backend Runtime timeout-minutes: 20 diff --git a/python/pyproject.toml b/python/pyproject.toml index 6b1b032fdc..4908ad051f 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -27,7 +27,7 @@ srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "intere openai = ["openai>=1.0", "tiktoken"] anthropic = ["anthropic>=0.20.0"] litellm = ["litellm>=1.0.0"] -test = ["jsonlines", "matplotlib", "pandas"] +test = ["jsonlines", "matplotlib", "pandas", "sentence_transformers", "accelerate"] all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"] dev = ["sglang[all]", "sglang[test]"] diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 3f25ad5607..c74251947b 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -427,12 +427,10 @@ def _get_sampling_params(self, sampling_params_data: dict): return sampling_params async def _get_pixel_values(self, image_data): - if isinstance(image_data, list) and len(image_data) > 0: - return await self._get_pixel_values_internal(image_data) - elif isinstance(image_data, str): - return await self._get_pixel_values_internal(image_data) - else: + if image_data is None: return None, None, None + else: + return await self._get_pixel_values_internal(image_data) async def _wait_for_response( self, diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 0468863996..59e2ab2924 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -465,7 +465,7 @@ def run_unittest_files(files: List[str], timeout_per_file: float): def run_one_file(filename): filename = os.path.join(os.getcwd(), filename) - print(f"\n\nRun {filename}\n\n") + print(f"\n\nRun {filename}\n\n", flush=True) process = subprocess.Popen( ["python3", filename], stdout=None, stderr=None, env=os.environ ) @@ -481,15 +481,16 @@ def run_one_file(filename): kill_child_process(process.pid) time.sleep(5) print( - f"\nTimeout after {timeout_per_file} seconds when running {filename}\n" + f"\nTimeout after {timeout_per_file} seconds when running {filename}\n", + flush=True, ) success = False break if success: - print(f"Success. Time elapsed: {time.time() - tic:.2f}s") + print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True) else: - print(f"Fail. Time elapsed: {time.time() - tic:.2f}s") + print(f"Fail. 
Time elapsed: {time.time() - tic:.2f}s", flush=True) return 0 if success else -1 From 97589a60a2cf2ef75d26ca0de9a78f30e2b63c4e Mon Sep 17 00:00:00 2001 From: Mingyi Date: Sun, 25 Aug 2024 21:54:02 -0700 Subject: [PATCH 090/118] [CI] Parallelize unit tests in CI (#1219) --- .github/workflows/accuracy-test.yml | 30 ++++++++--------- .github/workflows/e2e-test.yml | 52 ++++++++++++++--------------- .github/workflows/moe-test.yml | 34 +++++++++---------- .github/workflows/unit-test.yml | 49 ++++++++++++++++----------- python/sglang/test/test_utils.py | 2 +- test/srt/run_suite.py | 15 ++++++++- 6 files changed, 103 insertions(+), 79 deletions(-) diff --git a/.github/workflows/accuracy-test.yml b/.github/workflows/accuracy-test.yml index 6e1818c9c9..6fb102a4c5 100644 --- a/.github/workflows/accuracy-test.yml +++ b/.github/workflows/accuracy-test.yml @@ -23,21 +23,21 @@ jobs: runs-on: 1-gpu-runner steps: - - name: Checkout code - uses: actions/checkout@v3 + - name: Checkout code + uses: actions/checkout@v3 - - name: Install dependencies - run: | - pip install --upgrade pip - pip install -e "python[all]" - pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall + - name: Install dependencies + run: | + pip install --upgrade pip + pip install -e "python[all]" + pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall - git clone https://github.com/merrymercy/human-eval.git - cd human-eval - pip install -e . + git clone https://github.com/merrymercy/human-eval.git + cd human-eval + pip install -e . - - name: Evaluate Accuracy - timeout-minutes: 20 - run: | - cd test/srt - python3 test_eval_accuracy_large.py + - name: Evaluate Accuracy + timeout-minutes: 20 + run: | + cd test/srt + python3 test_eval_accuracy_large.py diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 2db6801c76..7f555110d9 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -23,29 +23,29 @@ jobs: runs-on: 1-gpu-runner steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Install dependencies - run: | - pip install --upgrade pip - pip install -e "python[all]" - pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall - - - name: Benchmark Serving Throughput - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_serving_throughput.TestServingThroughput.test_default - - - name: Benchmark Serving Throughput (w/o RadixAttention) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache - - - name: Benchmark Serving Throughput (w/o ChunkedPrefill) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill + - name: Checkout code + uses: actions/checkout@v3 + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install -e "python[all]" + pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall + + - name: Benchmark Serving Throughput + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_serving_throughput.TestServingThroughput.test_default + + - name: Benchmark Serving Throughput (w/o RadixAttention) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache + + - name: Benchmark Serving Throughput (w/o ChunkedPrefill) + 
timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill diff --git a/.github/workflows/moe-test.yml b/.github/workflows/moe-test.yml index 111f190c7c..4440aa215f 100644 --- a/.github/workflows/moe-test.yml +++ b/.github/workflows/moe-test.yml @@ -23,23 +23,23 @@ jobs: runs-on: 2-gpu-runner steps: - - name: Checkout code - uses: actions/checkout@v3 + - name: Checkout code + uses: actions/checkout@v3 - - name: Install dependencies - run: | - pip install --upgrade pip - pip install -e "python[all]" - pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall + - name: Install dependencies + run: | + pip install --upgrade pip + pip install -e "python[all]" + pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall - - name: Benchmark MoE Serving Throughput - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default + - name: Benchmark MoE Serving Throughput + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default - - name: Benchmark MoE Serving Throughput (w/o RadixAttention) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache + - name: Benchmark MoE Serving Throughput (w/o RadixAttention) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 6ab3716033..41a565a638 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -18,28 +18,39 @@ concurrency: cancel-in-progress: true jobs: - unit-test: + unit-test-jobs: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' runs-on: 1-gpu-runner - + strategy: + matrix: + test_type: ['backend-0', 'backend-1', 'frontend'] steps: - - name: Checkout code - uses: actions/checkout@v3 + - name: Checkout code + uses: actions/checkout@v3 - - name: Install dependencies - run: | - pip install --upgrade pip - pip install -e "python[dev]" - pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall + - name: Install dependencies + run: | + pip install --upgrade pip + pip install -e "python[dev]" + pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall - - name: Test Backend Runtime - timeout-minutes: 20 - run: | - cd test/srt - python3 run_suite.py --suite minimal + - name: Run test + timeout-minutes: 20 + run: | + if [ "${{ matrix.test_type }}" = "frontend" ]; then + cd test/lang + python3 run_suite.py --suite minimal + elif [ "${{ matrix.test_type }}" = "backend-0" ]; then + cd test/srt + python3 run_suite.py --suite minimal --range-begin 0 --range-end 8 + elif [ "${{ matrix.test_type }}" = "backend-1" ]; then + cd test/srt + python3 run_suite.py --suite minimal --range-begin 8 + fi - - name: Test Frontend Language - timeout-minutes: 10 - run: | - cd test/lang - python3 run_suite.py --suite minimal + unit-test: + needs: unit-test-jobs + runs-on: ubuntu-latest + steps: + - name: Merge step + run: echo "This is an empty merge step" \ No newline at end of file diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 59e2ab2924..d6a1792b85 100644 --- 
a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -465,7 +465,7 @@ def run_unittest_files(files: List[str], timeout_per_file: float): def run_one_file(filename): filename = os.path.join(os.getcwd(), filename) - print(f"\n\nRun {filename}\n\n", flush=True) + print(f"\n\nRun:\npython3 {filename}\n\n", flush=True) process = subprocess.Popen( ["python3", filename], stdout=None, stderr=None, env=os.environ ) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 4e11d8da25..2351579f19 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -1,6 +1,5 @@ import argparse import glob -import multiprocessing as mp from sglang.test.test_utils import run_unittest_files @@ -49,6 +48,18 @@ choices=list(suites.keys()) + ["all"], help="The suite to run", ) + arg_parser.add_argument( + "--range-begin", + type=int, + default=0, + help="The begin index of the range of the files to run.", + ) + arg_parser.add_argument( + "--range-end", + type=int, + default=None, + help="The end index of the range of the files to run.", + ) args = arg_parser.parse_args() if args.suite == "all": @@ -56,5 +67,7 @@ else: files = suites[args.suite] + files = files[args.range_begin : args.range_end] + exit_code = run_unittest_files(files, args.timeout_per_file) exit(exit_code) From 75ce37f40139394bd2f3f55250095477d8c9b16d Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Mon, 26 Aug 2024 07:02:50 -0700 Subject: [PATCH 091/118] Move sampler into CUDA graph (#1201) Co-authored-by: Yineng Zhang --- python/sglang/srt/layers/logits_processor.py | 8 +- python/sglang/srt/layers/sampler.py | 83 +++++++++++++++---- python/sglang/srt/managers/schedule_batch.py | 28 +++++-- python/sglang/srt/managers/tp_worker.py | 52 +++++++----- .../srt/model_executor/cuda_graph_runner.py | 33 ++++++-- .../srt/model_executor/forward_batch_info.py | 9 +- .../sglang/srt/model_executor/model_runner.py | 14 +++- python/sglang/srt/models/chatglm.py | 16 +--- python/sglang/srt/models/commandr.py | 6 +- python/sglang/srt/models/dbrx.py | 6 +- python/sglang/srt/models/deepseek.py | 6 +- python/sglang/srt/models/deepseek_v2.py | 6 +- python/sglang/srt/models/gemma.py | 6 +- python/sglang/srt/models/gemma2.py | 6 +- python/sglang/srt/models/gpt_bigcode.py | 6 +- python/sglang/srt/models/grok.py | 6 +- python/sglang/srt/models/internlm2.py | 6 +- python/sglang/srt/models/llama2.py | 10 ++- .../sglang/srt/models/llama_classification.py | 4 +- python/sglang/srt/models/minicpm.py | 6 +- python/sglang/srt/models/mixtral.py | 6 +- python/sglang/srt/models/mixtral_quant.py | 6 +- python/sglang/srt/models/qwen.py | 7 +- python/sglang/srt/models/qwen2.py | 8 +- python/sglang/srt/models/qwen2_moe.py | 19 ++--- python/sglang/srt/models/stablelm.py | 6 +- .../srt/sampling/sampling_batch_info.py | 75 ++++++++++++++++- python/sglang/test/runners.py | 2 +- 28 files changed, 336 insertions(+), 110 deletions(-) diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py index 63f74d8b02..b81f3d2a04 100644 --- a/python/sglang/srt/layers/logits_processor.py +++ b/python/sglang/srt/layers/logits_processor.py @@ -29,7 +29,7 @@ @dataclasses.dataclass -class LogitProcessorOutput: +class LogitsProcessorOutput: # The logits of the next tokens. shape: [#seq, vocab_size] next_token_logits: torch.Tensor # The logprobs of the next tokens. 
shape: [#seq, vocab_size] @@ -185,7 +185,7 @@ def forward( # Return only last_logits if logprob is not requested if not logits_metadata.return_logprob: - return LogitProcessorOutput( + return LogitsProcessorOutput( next_token_logits=last_logits, next_token_logprobs=None, normalized_prompt_logprobs=None, @@ -209,7 +209,7 @@ def forward( else: output_top_logprobs = None - return LogitProcessorOutput( + return LogitsProcessorOutput( next_token_logits=last_logits, next_token_logprobs=last_logprobs, normalized_prompt_logprobs=None, @@ -278,7 +278,7 @@ def forward( # Remove the last token logprob for the prefill tokens. input_token_logprobs = input_token_logprobs[:-1] - return LogitProcessorOutput( + return LogitsProcessorOutput( next_token_logits=last_logits, next_token_logprobs=last_logprobs, normalized_prompt_logprobs=normalized_prompt_logprobs, diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py index 3006e765c8..6cb7d0a7c1 100644 --- a/python/sglang/srt/layers/sampler.py +++ b/python/sglang/srt/layers/sampler.py @@ -1,4 +1,6 @@ +import dataclasses import logging +from typing import Union import torch from flashinfer.sampling import ( @@ -9,6 +11,8 @@ ) from vllm.model_executor.custom_op import CustomOp +from sglang.srt.layers.logits_processor import LogitsProcessorOutput + # TODO: move this dict to another place from sglang.srt.managers.schedule_batch import global_server_args_dict from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo @@ -16,30 +20,71 @@ logger = logging.getLogger(__name__) +@dataclasses.dataclass +class SampleOutput: + success: torch.Tensor + probs: torch.Tensor + batch_next_token_ids: torch.Tensor + + class Sampler(CustomOp): def __init__(self): super().__init__() - def forward_cuda(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo): + def _apply_penalties(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo): + # min-token, presence, frequency + if sampling_info.linear_penalties is not None: + logits += sampling_info.linear_penalties + + # repetition + if sampling_info.scaling_penalties is not None: + logits = torch.where( + logits > 0, + logits / sampling_info.scaling_penalties, + logits * sampling_info.scaling_penalties, + ) + + return logits + + def _get_probs( + self, + logits: torch.Tensor, + sampling_info: SamplingBatchInfo, + is_torch_compile: bool = False, + ): # Post process logits logits = logits.contiguous() logits.div_(sampling_info.temperatures) + if is_torch_compile: + # FIXME: Temporary workaround for unknown bugs in torch.compile + logits.add_(0) + if sampling_info.logit_bias is not None: logits.add_(sampling_info.logit_bias) if sampling_info.vocab_mask is not None: logits = logits.masked_fill(~sampling_info.vocab_mask, float("-inf")) - logits = sampling_info.penalizer_orchestrator.apply(logits) + logits = self._apply_penalties(logits, sampling_info) - probs = torch.softmax(logits, dim=-1) + return torch.softmax(logits, dim=-1) + + def forward_cuda( + self, + logits: Union[torch.Tensor, LogitsProcessorOutput], + sampling_info: SamplingBatchInfo, + ): + if isinstance(logits, LogitsProcessorOutput): + logits = logits.next_token_logits + + probs = self._get_probs(logits, sampling_info) if not global_server_args_dict["disable_flashinfer_sampling"]: max_top_k_round, batch_size = 32, probs.shape[0] uniform_samples = torch.rand( (max_top_k_round, batch_size), device=probs.device ) - if sampling_info.min_ps.any(): + if sampling_info.need_min_p_sampling: probs = top_k_renorm_prob(probs, 
sampling_info.top_ks) probs = top_p_renorm_prob(probs, sampling_info.top_ps) batch_next_token_ids, success = min_p_sampling_from_probs( @@ -55,18 +100,23 @@ def forward_cuda(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo): probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps ) - if not torch.all(success): - logging.warning("Sampling failed, fallback to top_k=1 strategy") - probs = probs.masked_fill(torch.isnan(probs), 0.0) - argmax_ids = torch.argmax(probs, dim=-1) - batch_next_token_ids = torch.where( - success, batch_next_token_ids, argmax_ids - ) + return SampleOutput(success, probs, batch_next_token_ids) - return batch_next_token_ids + def forward_native( + self, + logits: Union[torch.Tensor, LogitsProcessorOutput], + sampling_info: SamplingBatchInfo, + ): + if isinstance(logits, LogitsProcessorOutput): + logits = logits.next_token_logits + + probs = self._get_probs(logits, sampling_info, is_torch_compile=True) + + batch_next_token_ids, success = top_k_top_p_min_p_sampling_from_probs_torch( + probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps + ) - def forward_native(): - raise NotImplementedError("Native forward is not implemented yet.") + return SampleOutput(success, probs, batch_next_token_ids) def top_k_top_p_min_p_sampling_from_probs_torch( @@ -87,7 +137,10 @@ def top_k_top_p_min_p_sampling_from_probs_torch( probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0 probs_sort.div_(probs_sort.max(dim=-1, keepdim=True)[0]) try: - sampled_index = torch.multinomial(probs_sort, num_samples=1) + # FIXME: torch.multiomial does not support num_samples = 1 + sampled_index = torch.multinomial(probs_sort, num_samples=2, replacement=True)[ + :, :1 + ] except RuntimeError as e: logger.warning(f"Sampling error: {e}") batch_next_token_ids = torch.zeros( diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index e61f13cb9d..dfd32dea9c 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -1,3 +1,5 @@ +from __future__ import annotations + """ Copyright 2023-2024 SGLang Team Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,7 +19,7 @@ import logging from dataclasses import dataclass -from typing import List, Optional, Union +from typing import TYPE_CHECKING, List, Optional, Union import torch @@ -29,6 +31,10 @@ from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo +if TYPE_CHECKING: + from sglang.srt.layers.sampler import SampleOutput + + INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5 # Put some global args for easy access @@ -671,11 +677,17 @@ def merge(self, other: "ScheduleBatch"): self.top_logprobs_nums.extend(other.top_logprobs_nums) self.return_logprob = any(req.return_logprob for req in self.reqs) - def sample(self, logits: torch.Tensor): - from sglang.srt.layers.sampler import Sampler - - sampler = Sampler() - - batch_next_token_ids = sampler(logits, self.sampling_info) + def check_sample_results(self, sample_output: SampleOutput): + if not torch.all(sample_output.success): + probs = sample_output.probs + batch_next_token_ids = sample_output.batch_next_token_ids + logging.warning("Sampling failed, fallback to top_k=1 strategy") + probs = probs.masked_fill(torch.isnan(probs), 0.0) + argmax_ids = torch.argmax(probs, dim=-1) + batch_next_token_ids = torch.where( + sample_output.success, batch_next_token_ids, argmax_ids + 
) + sample_output.probs = probs + sample_output.batch_next_token_ids = batch_next_token_ids - return batch_next_token_ids + return sample_output.batch_next_token_ids diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index 9820e0302e..ddf20970e7 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -31,7 +31,7 @@ from sglang.srt.constrained.fsm_cache import FSMCache from sglang.srt.constrained.jump_forward import JumpForwardCache from sglang.srt.hf_transformers_utils import get_processor, get_tokenizer -from sglang.srt.layers.logits_processor import LogitProcessorOutput +from sglang.srt.layers.logits_processor import LogitsProcessorOutput from sglang.srt.managers.io_struct import ( AbortReq, BatchEmbeddingOut, @@ -486,21 +486,29 @@ def forward_prefill_batch(self, batch: ScheduleBatch): if self.model_runner.is_generation: # Forward and sample the next tokens if batch.extend_num_tokens != 0: - output = self.model_runner.forward(batch, ForwardMode.EXTEND) - next_token_ids = batch.sample(output.next_token_logits) + sample_output, logits_output = self.model_runner.forward( + batch, ForwardMode.EXTEND + ) + next_token_ids = batch.check_sample_results(sample_output) batch.sampling_info.penalizer_orchestrator.cumulate_output_tokens( next_token_ids ) # Move logprobs to cpu - if output.next_token_logprobs is not None: - output.next_token_logprobs = output.next_token_logprobs[ - torch.arange(len(next_token_ids), device=next_token_ids.device), - next_token_ids, - ].tolist() - output.input_token_logprobs = output.input_token_logprobs.tolist() - output.normalized_prompt_logprobs = ( - output.normalized_prompt_logprobs.tolist() + if logits_output.next_token_logprobs is not None: + logits_output.next_token_logprobs = ( + logits_output.next_token_logprobs[ + torch.arange( + len(next_token_ids), device=next_token_ids.device + ), + next_token_ids, + ].tolist() + ) + logits_output.input_token_logprobs = ( + logits_output.input_token_logprobs.tolist() + ) + logits_output.normalized_prompt_logprobs = ( + logits_output.normalized_prompt_logprobs.tolist() ) next_token_ids = next_token_ids.tolist() @@ -539,12 +547,14 @@ def forward_prefill_batch(self, batch: ScheduleBatch): self.req_to_token_pool.free(req.req_pool_idx) if req.return_logprob: - self.add_logprob_return_values(i, req, pt, next_token_ids, output) + self.add_logprob_return_values( + i, req, pt, next_token_ids, logits_output + ) pt += req.extend_input_len else: assert batch.extend_num_tokens != 0 - output = self.model_runner.forward(batch, ForwardMode.EXTEND) - embeddings = output.embeddings.tolist() + logits_output = self.model_runner.forward(batch, ForwardMode.EXTEND) + embeddings = logits_output.embeddings.tolist() # Check finish conditions for i, req in enumerate(batch.reqs): @@ -572,7 +582,7 @@ def add_logprob_return_values( req: Req, pt: int, next_token_ids: List[int], - output: LogitProcessorOutput, + output: LogitsProcessorOutput, ): if req.normalized_prompt_logprob is None: req.normalized_prompt_logprob = output.normalized_prompt_logprobs[i] @@ -654,15 +664,17 @@ def forward_decode_batch(self, batch: ScheduleBatch): batch.prepare_for_decode() # Forward and sample the next tokens - output = self.model_runner.forward(batch, ForwardMode.DECODE) - next_token_ids = batch.sample(output.next_token_logits) + sample_output, logits_output = self.model_runner.forward( + batch, ForwardMode.DECODE + ) + next_token_ids = batch.check_sample_results(sample_output) 
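
The sampler hunks above introduce a `SampleOutput` with per-row success flags and a torch-native fallback for combined top-k, top-p, and min-p filtering. The sketch below is a minimal, standalone rendering of that filtering scheme in plain PyTorch; the function and variable names are illustrative, and unlike `sampler.py` it omits the flashinfer fast path and the guards for rows whose probability mass is filtered to zero.

```python
import torch


def sample_top_k_top_p_min_p(
    probs: torch.Tensor,   # [batch, vocab], rows already softmaxed
    top_ks: torch.Tensor,  # [batch] int
    top_ps: torch.Tensor,  # [batch] float in (0, 1]
    min_ps: torch.Tensor,  # [batch] float in [0, 1)
) -> torch.Tensor:
    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
    # min-p threshold is a fraction of each row's maximum probability
    min_p_thresholds = probs_sort[:, 0] * min_ps
    # top-p: drop tokens once the cumulative mass before them exceeds top_p
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0
    # top-k: keep only the first top_k entries of each sorted row
    ranks = torch.arange(probs.shape[-1], device=probs.device).view(1, -1)
    probs_sort[ranks >= top_ks.view(-1, 1)] = 0.0
    # min-p: drop anything below the per-row threshold
    probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0
    # torch.multinomial accepts unnormalized non-negative weights
    sampled = torch.multinomial(probs_sort, num_samples=1)
    return torch.gather(probs_idx, dim=1, index=sampled).view(-1)


if __name__ == "__main__":
    torch.manual_seed(0)
    probs = torch.softmax(torch.randn(4, 32), dim=-1)
    token_ids = sample_top_k_top_p_min_p(
        probs,
        top_ks=torch.full((4,), 8),
        top_ps=torch.full((4,), 0.9),
        min_ps=torch.full((4,), 0.05),
    )
    print(token_ids)
```

In the patched code path this style of torch sampling is only the fallback; the `disable_flashinfer_sampling` branch shown above prefers the flashinfer kernels and reports failures through `SampleOutput.success`, which the scheduler resolves via `check_sample_results`.
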
batch.sampling_info.penalizer_orchestrator.cumulate_output_tokens( next_token_ids ) # Move logprobs to cpu - if output.next_token_logprobs is not None: - next_token_logprobs = output.next_token_logprobs[ + if logits_output.next_token_logprobs is not None: + next_token_logprobs = logits_output.next_token_logprobs[ torch.arange(len(next_token_ids), device=next_token_ids.device), next_token_ids, ].tolist() @@ -688,7 +700,7 @@ def forward_decode_batch(self, batch: ScheduleBatch): (next_token_logprobs[i], next_token_id) ) if req.top_logprobs_num > 0: - req.output_top_logprobs.append(output.output_top_logprobs[i]) + req.output_top_logprobs.append(logits_output.output_top_logprobs[i]) self.handle_finished_requests(batch) diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index d045be56d8..96c15849e4 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -25,16 +25,18 @@ from vllm.model_executor.custom_op import CustomOp from sglang.srt.layers.logits_processor import ( - LogitProcessorOutput, LogitsMetadata, LogitsProcessor, + LogitsProcessorOutput, ) +from sglang.srt.layers.sampler import SampleOutput from sglang.srt.managers.schedule_batch import ScheduleBatch from sglang.srt.model_executor.forward_batch_info import ( ForwardMode, InputMetadata, update_flashinfer_indices, ) +from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo from sglang.srt.utils import monkey_patch_vllm_all_gather @@ -143,6 +145,10 @@ def __init__( self.flashinfer_kv_indices.clone(), ] + # Sampling inputs + vocab_size = model_runner.model_config.vocab_size + self.sampling_info = SamplingBatchInfo.dummy_one(self.max_bs, vocab_size) + self.compile_bs = [1, 2, 4, 8, 16, 24, 32] if use_torch_compile else [] if use_torch_compile: @@ -234,6 +240,7 @@ def capture_one_batch_size(self, bs, forward): def run_once(): input_metadata = InputMetadata( forward_mode=ForwardMode.DECODE, + sampling_info=self.sampling_info[:bs], batch_size=bs, req_pool_indices=req_pool_indices, seq_lens=seq_lens, @@ -298,27 +305,35 @@ def replay(self, batch: ScheduleBatch): self.flashinfer_handlers[bs], ) + # Sampling inputs + self.sampling_info.inplace_assign(raw_bs, batch.sampling_info) + # Replay torch.cuda.synchronize() self.graphs[bs].replay() torch.cuda.synchronize() - output = self.output_buffers[bs] + sample_output, logits_output = self.output_buffers[bs] # Unpad if bs != raw_bs: - output = LogitProcessorOutput( - next_token_logits=output.next_token_logits[:raw_bs], + logits_output = LogitsProcessorOutput( + next_token_logits=logits_output.next_token_logits[:raw_bs], next_token_logprobs=None, normalized_prompt_logprobs=None, input_token_logprobs=None, input_top_logprobs=None, output_top_logprobs=None, ) + sample_output = SampleOutput( + sample_output.success[:raw_bs], + sample_output.probs[:raw_bs], + sample_output.batch_next_token_ids[:raw_bs], + ) # Extract logprobs if batch.return_logprob: - output.next_token_logprobs = torch.nn.functional.log_softmax( - output.next_token_logits, dim=-1 + logits_output.next_token_logprobs = torch.nn.functional.log_softmax( + logits_output.next_token_logits, dim=-1 ) return_top_logprob = any(x > 0 for x in batch.top_logprobs_nums) if return_top_logprob: @@ -326,8 +341,8 @@ def replay(self, batch: ScheduleBatch): forward_mode=ForwardMode.DECODE, top_logprobs_nums=batch.top_logprobs_nums, ) - output.output_top_logprobs = LogitsProcessor.get_top_logprobs( - 
output.next_token_logprobs, logits_metadata + logits_output.output_top_logprobs = LogitsProcessor.get_top_logprobs( + logits_output.next_token_logprobs, logits_metadata )[1] - return output + return sample_output, logits_output diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py index c107b3bc82..e8849962b0 100644 --- a/python/sglang/srt/model_executor/forward_batch_info.py +++ b/python/sglang/srt/model_executor/forward_batch_info.py @@ -1,3 +1,5 @@ +from __future__ import annotations + """ Copyright 2023-2024 SGLang Team Licensed under the Apache License, Version 2.0 (the "License"); @@ -16,7 +18,7 @@ """ModelRunner runs the forward passes of the models.""" from dataclasses import dataclass from enum import IntEnum, auto -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, List import numpy as np import torch @@ -26,6 +28,7 @@ if TYPE_CHECKING: from sglang.srt.model_executor.model_runner import ModelRunner + from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo class ForwardMode(IntEnum): @@ -42,6 +45,7 @@ class InputMetadata: """Store all inforamtion of a forward pass.""" forward_mode: ForwardMode + sampling_info: SamplingBatchInfo batch_size: int req_pool_indices: torch.Tensor seq_lens: torch.Tensor @@ -179,6 +183,7 @@ def from_schedule_batch( ): ret = cls( forward_mode=forward_mode, + sampling_info=batch.sampling_info, batch_size=batch.batch_size(), req_pool_indices=batch.req_pool_indices, seq_lens=batch.seq_lens, @@ -189,6 +194,8 @@ def from_schedule_batch( top_logprobs_nums=batch.top_logprobs_nums, ) + ret.sampling_info.prepare_penalties() + ret.compute_positions(batch) ret.compute_extend_infos(batch) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index fecfc2b430..eb6fe319f9 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -21,7 +21,7 @@ import logging import pkgutil from functools import lru_cache -from typing import Optional, Type +from typing import Optional, Tuple, Type import torch import torch.nn as nn @@ -44,6 +44,8 @@ from vllm.model_executor.models import ModelRegistry from sglang.global_config import global_config +from sglang.srt.layers.logits_processor import LogitsProcessorOutput +from sglang.srt.layers.sampler import SampleOutput from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict from sglang.srt.mem_cache.memory_pool import ( MHATokenToKVPool, @@ -514,7 +516,11 @@ def init_cuda_graphs(self): @torch.inference_mode() def forward_decode(self, batch: ScheduleBatch): - if self.cuda_graph_runner and self.cuda_graph_runner.can_run(len(batch.reqs)): + if ( + self.cuda_graph_runner + and self.cuda_graph_runner.can_run(len(batch.reqs)) + and not batch.sampling_info.has_bias() + ): return self.cuda_graph_runner.replay(batch) input_metadata = InputMetadata.from_schedule_batch( @@ -563,7 +569,9 @@ def forward_extend_multi_modal(self, batch: ScheduleBatch): input_metadata.image_offsets, ) - def forward(self, batch: ScheduleBatch, forward_mode: ForwardMode): + def forward( + self, batch: ScheduleBatch, forward_mode: ForwardMode + ) -> Tuple[SampleOutput, LogitsProcessorOutput]: if self.is_multimodal_model and forward_mode == ForwardMode.EXTEND: return self.forward_extend_multi_modal(batch) elif forward_mode == ForwardMode.DECODE: diff --git a/python/sglang/srt/models/chatglm.py 
b/python/sglang/srt/models/chatglm.py index 0a22f994bb..1c189eebbc 100644 --- a/python/sglang/srt/models/chatglm.py +++ b/python/sglang/srt/models/chatglm.py @@ -31,20 +31,18 @@ ) from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import ChatGLMConfig from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata LoraConfig = None @@ -383,17 +381,11 @@ def forward( input_metadata: InputMetadata, ) -> torch.Tensor: hidden_states = self.transformer(input_ids, positions, input_metadata) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters(remove_duplicate=False)) diff --git a/python/sglang/srt/models/commandr.py b/python/sglang/srt/models/commandr.py index f6d6f6e1f9..c360106f97 100644 --- a/python/sglang/srt/models/commandr.py +++ b/python/sglang/srt/models/commandr.py @@ -64,6 +64,7 @@ from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -326,6 +327,7 @@ def __init__( self.config = config self.quant_config = quant_config self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() self.model = CohereModel(config, quant_config) @torch.no_grad() @@ -340,9 +342,11 @@ def forward( positions, input_metadata, ) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.model.embed_tokens.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/dbrx.py b/python/sglang/srt/models/dbrx.py index 39ac4aefa7..b3a76b56ae 100644 --- a/python/sglang/srt/models/dbrx.py +++ b/python/sglang/srt/models/dbrx.py @@ -45,6 +45,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -382,6 +383,7 @@ def __init__( padding_size=DEFAULT_VOCAB_PADDING_SIZE, ) 
self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -391,9 +393,11 @@ def forward( input_metadata: InputMetadata, ) -> torch.Tensor: hidden_states = self.transformer(input_ids, positions, input_metadata) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): expert_params_mapping = [ diff --git a/python/sglang/srt/models/deepseek.py b/python/sglang/srt/models/deepseek.py index 59fd1ec7ed..b939602c1b 100644 --- a/python/sglang/srt/models/deepseek.py +++ b/python/sglang/srt/models/deepseek.py @@ -46,6 +46,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -385,6 +386,7 @@ def __init__( config.vocab_size, config.hidden_size, quant_config=quant_config ) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -394,9 +396,11 @@ def forward( input_metadata: InputMetadata, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index 13dd477392..15ecf4bb66 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -45,6 +45,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.managers.schedule_batch import global_server_args_dict from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -632,6 +633,7 @@ def __init__( config.vocab_size, config.hidden_size, quant_config=quant_config ) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() def forward( self, @@ -640,9 +642,11 @@ def forward( input_metadata: InputMetadata, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/gemma.py b/python/sglang/srt/models/gemma.py index 990937f518..61cc5c66ea 100644 --- a/python/sglang/srt/models/gemma.py +++ b/python/sglang/srt/models/gemma.py @@ -37,6 +37,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import 
Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -287,6 +288,7 @@ def __init__( self.quant_config = quant_config self.model = GemmaModel(config, quant_config=quant_config) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -297,9 +299,11 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.model.embed_tokens.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return (sample_output, logits_output) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/gemma2.py b/python/sglang/srt/models/gemma2.py index c6dbc7e556..fabf86b498 100644 --- a/python/sglang/srt/models/gemma2.py +++ b/python/sglang/srt/models/gemma2.py @@ -41,6 +41,7 @@ from sglang.srt.layers.activation import GeluAndMul from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -396,6 +397,7 @@ def __init__( self.quant_config = quant_config self.model = Gemma2Model(config, cache_config, quant_config) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -406,9 +408,11 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.model.embed_tokens.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def get_attention_sliding_window_size(self): return get_attention_sliding_window_size(self.config) diff --git a/python/sglang/srt/models/gpt_bigcode.py b/python/sglang/srt/models/gpt_bigcode.py index 9a9e2aec3a..979d06886e 100644 --- a/python/sglang/srt/models/gpt_bigcode.py +++ b/python/sglang/srt/models/gpt_bigcode.py @@ -35,6 +35,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -261,6 +262,7 @@ def __init__( if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -270,9 +272,11 @@ def forward( input_metadata: InputMetadata, ) -> torch.Tensor: hidden_states = self.transformer(input_ids, positions, input_metadata) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters(remove_duplicate=False)) diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py index 4a0a08bf88..85a89ca3ed 100644 --- a/python/sglang/srt/models/grok.py +++ b/python/sglang/srt/models/grok.py @@ 
-46,6 +46,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -297,6 +298,7 @@ def __init__( self.model = Grok1Model(config, quant_config=quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() # Monkey patch _prepare_weights to load pre-sharded weights setattr(DefaultModelLoader, "_prepare_weights", _prepare_presharded_weights) @@ -313,9 +315,11 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/internlm2.py b/python/sglang/srt/models/internlm2.py index f2947e991b..c0e4d19e12 100644 --- a/python/sglang/srt/models/internlm2.py +++ b/python/sglang/srt/models/internlm2.py @@ -40,6 +40,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -262,6 +263,7 @@ def __init__( self.model = InternLM2Model(config, quant_config) self.output = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -272,9 +274,11 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.output.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/llama2.py b/python/sglang/srt/models/llama2.py index 9de8d33c5c..42e9612303 100644 --- a/python/sglang/srt/models/llama2.py +++ b/python/sglang/srt/models/llama2.py @@ -39,8 +39,9 @@ from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm -from sglang.srt.layers.logits_processor import LogitProcessorOutput, LogitsProcessor +from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -302,6 +303,7 @@ def __init__( self.model = LlamaModel(config, quant_config=quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -310,11 +312,13 @@ def forward( positions: torch.Tensor, input_metadata: InputMetadata, input_embeds: torch.Tensor = None, - 
) -> LogitProcessorOutput: + ) -> LogitsProcessorOutput: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def get_module_name(self, name): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/llama_classification.py b/python/sglang/srt/models/llama_classification.py index 02224971d6..fdf6d28e55 100644 --- a/python/sglang/srt/models/llama_classification.py +++ b/python/sglang/srt/models/llama_classification.py @@ -24,7 +24,7 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from sglang.srt.layers.logits_processor import LogitProcessorOutput +from sglang.srt.layers.logits_processor import LogitsProcessorOutput from sglang.srt.model_executor.forward_batch_info import InputMetadata from sglang.srt.models.llama2 import LlamaModel @@ -65,7 +65,7 @@ def forward( (input_metadata.batch_size, self.config.classification_out_size) ).to(input_ids.device) - return LogitProcessorOutput( + return LogitsProcessorOutput( next_token_logits=scores, next_token_logprobs=scores, normalized_prompt_logprobs=scores, diff --git a/python/sglang/srt/models/minicpm.py b/python/sglang/srt/models/minicpm.py index 49ff1926f3..0028ae67a8 100644 --- a/python/sglang/srt/models/minicpm.py +++ b/python/sglang/srt/models/minicpm.py @@ -39,6 +39,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -297,6 +298,7 @@ def __init__( self.scale_width = self.config.hidden_size / self.config.dim_model_base self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -314,9 +316,11 @@ def forward( lm_head_weight = self.model.embed_tokens.weight else: lm_head_weight = self.lm_head.weight - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, lm_head_weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py index d11f6c9519..ca38cb03ba 100644 --- a/python/sglang/srt/models/mixtral.py +++ b/python/sglang/srt/models/mixtral.py @@ -41,6 +41,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -299,6 +300,7 @@ def __init__( self.model = MixtralModel(config, quant_config=quant_config, prefix="model") self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() def forward( self, @@ -308,9 +310,11 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, 
input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/mixtral_quant.py b/python/sglang/srt/models/mixtral_quant.py index b02e925c5a..97ac09ee62 100644 --- a/python/sglang/srt/models/mixtral_quant.py +++ b/python/sglang/srt/models/mixtral_quant.py @@ -45,6 +45,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -333,6 +334,7 @@ def __init__( self.model = MixtralModel(config, quant_config=quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -343,9 +345,11 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/qwen.py b/python/sglang/srt/models/qwen.py index 93dae9585c..4958a81298 100644 --- a/python/sglang/srt/models/qwen.py +++ b/python/sglang/srt/models/qwen.py @@ -39,6 +39,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -251,6 +252,7 @@ def __init__( vocab_size = ((config.vocab_size + 63) // 64) * 64 self.lm_head = ParallelLMHead(vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -260,10 +262,11 @@ def forward( input_metadata: InputMetadata, ): hidden_states = self.transformer(input_ids, positions, input_metadata) - next_tokens = self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - return next_tokens + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py index fcf083e1b5..76094b907a 100644 --- a/python/sglang/srt/models/qwen2.py +++ b/python/sglang/srt/models/qwen2.py @@ -38,8 +38,9 @@ from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor -from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType +from sglang.srt.layers.pooler import Pooler, PoolingType from sglang.srt.layers.radix_attention import RadixAttention +from 
sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata Qwen2Config = None @@ -276,6 +277,7 @@ def __init__( self.model = Qwen2Model(config, quant_config=quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True) @torch.no_grad() @@ -289,9 +291,11 @@ def forward( ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) if not get_embedding: - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output else: return self.pooler(hidden_states, input_metadata) diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py index 9bdbd75066..e08695bc61 100644 --- a/python/sglang/srt/models/qwen2_moe.py +++ b/python/sglang/srt/models/qwen2_moe.py @@ -35,10 +35,8 @@ ReplicatedLinear, RowParallelLinear, ) -from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, @@ -49,6 +47,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -366,6 +365,7 @@ def __init__( config.vocab_size, config.hidden_size, quant_config=quant_config ) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -376,20 +376,11 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - - def compute_logits( - self, - input_ids: torch.Tensor, - hidden_states: torch.Tensor, - input_metadata: InputMetadata, - ) -> torch.Tensor: - logits = self.logits_processor( - input_ids, hidden_states, self.lm_head.weight, input_metadata - ) - return logits + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/stablelm.py b/python/sglang/srt/models/stablelm.py index 9e10f12f2a..a3102baabd 100644 --- a/python/sglang/srt/models/stablelm.py +++ b/python/sglang/srt/models/stablelm.py @@ -40,6 +40,7 @@ from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -249,6 +250,7 @@ def __init__( self.model = StableLMEpochModel(config, quant_config=quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) 
self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -259,9 +261,11 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/sampling/sampling_batch_info.py b/python/sglang/srt/sampling/sampling_batch_info.py index bc70a9018e..7843f4bd32 100644 --- a/python/sglang/srt/sampling/sampling_batch_info.py +++ b/python/sglang/srt/sampling/sampling_batch_info.py @@ -21,10 +21,63 @@ class SamplingBatchInfo: top_ps: torch.Tensor = None top_ks: torch.Tensor = None min_ps: torch.Tensor = None - penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None + + # Dispatch in CUDA graph + need_min_p_sampling: bool = False + + # Bias Tensors logit_bias: torch.Tensor = None vocab_mask: torch.Tensor = None + # Penalizer + penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None + linear_penalties: torch.Tensor = None + scaling_penalties: torch.Tensor = None + + def has_bias(self): + return ( + self.logit_bias is not None + or self.vocab_mask is not None + or self.linear_penalties is not None + or self.scaling_penalties is not None + ) + + @classmethod + def dummy_one(cls, max_bs: int, vocab_size: int): + ret = cls(vocab_size=vocab_size) + ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda") + ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda") + ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda") + ret.min_ps = torch.zeros((max_bs,), dtype=torch.float, device="cuda") + return ret + + def __getitem__(self, key): + if isinstance(key, slice): + # NOTE: We do not use cuda graph when there is bias tensors + assert not self.has_bias() + return SamplingBatchInfo( + vocab_size=self.vocab_size, + temperatures=self.temperatures[key], + top_ps=self.top_ps[key], + top_ks=self.top_ks[key], + min_ps=self.min_ps[key], + need_min_p_sampling=self.need_min_p_sampling, + ) + else: + raise NotImplementedError + + def inplace_assign(self, bs: int, other: SamplingBatchInfo): + # NOTE: We do not use cuda graph when there is bias tensors + assert not self.has_bias() + + self.vocab_size = other.vocab_size + self.need_min_p_sampling = other.need_min_p_sampling + + self.temperatures[:bs] = other.temperatures + self.top_ps[:bs] = other.top_ps + self.top_ks[:bs] = other.top_ks + self.min_ps[:bs] = other.min_ps + @classmethod def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int): device = "cuda" @@ -45,6 +98,7 @@ def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int): ret.min_ps = torch.tensor( [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device ) + ret.need_min_p_sampling = any(r.sampling_params.min_p > 0 for r in reqs) # Each penalizers will do nothing if they evaluate themselves as not required by looking at # the sampling_params of the requests (See {_is_required()} of each penalizers). 
So this @@ -72,6 +126,25 @@ def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int): return ret + def prepare_penalties(self): + self.scaling_penalties = None + self.linear_penalties = None + + for penalizer in self.penalizer_orchestrator.penalizers.values(): + if isinstance(penalizer, penaltylib.BatchedRepetitionPenalizer): + if penalizer.is_prepared(): + self.scaling_penalties = penalizer.cumulated_repetition_penalties + else: + if penalizer.is_prepared(): + if self.linear_penalties is None: + bs = self.penalizer_orchestrator.batch.batch_size() + self.linear_penalties = torch.zeros( + (bs, self.vocab_size), + dtype=torch.float32, + device="cuda", + ) + self.linear_penalties = penalizer.apply(self.linear_penalties) + def update_regex_vocab_mask(self, batch: ScheduleBatch): bs, reqs = batch.batch_size(), batch.reqs device = "cuda" diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index 37ed2cf9ad..2d3b0aefa3 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -180,7 +180,7 @@ def __init__( tp_size=tp_size, dtype=get_dtype_str(torch_dtype), port=port, - mem_fraction_static=0.7, + mem_fraction_static=0.69, trust_remote_code=False, is_embedding=not self.is_generation, ) From c5fe11a8e175d48b00b32aafd7412953180314e4 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 27 Aug 2024 00:28:24 +1000 Subject: [PATCH 092/118] chore: bump v0.2.14 (#1155) --- README.md | 2 +- python/pyproject.toml | 4 ++-- python/sglang/srt/model_executor/model_runner.py | 7 +++---- python/sglang/version.py | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 651108f9e2..09e3d56869 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ ### Method 2: From source ``` # Use the last release branch -git clone -b v0.2.13 https://github.com/sgl-project/sglang.git +git clone -b v0.2.14 https://github.com/sgl-project/sglang.git cd sglang pip install --upgrade pip diff --git a/python/pyproject.toml b/python/pyproject.toml index 4908ad051f..4a46adc3fe 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "sglang" -version = "0.2.13" +version = "0.2.14" description = "SGLang is yet another fast serving framework for large language models and vision language models." 
readme = "README.md" requires-python = ">=3.8" @@ -23,7 +23,7 @@ dependencies = [ srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow", "psutil", "pydantic", "python-multipart", "torch", "uvicorn", "uvloop", "zmq", - "vllm==0.5.4", "outlines>=0.0.44"] + "vllm==0.5.5", "outlines>=0.0.44"] openai = ["openai>=1.0", "tiktoken"] anthropic = ["anthropic>=0.20.0"] litellm = ["litellm>=1.0.0"] diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index eb6fe319f9..0066061149 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -191,12 +191,11 @@ def load_model(self): self.model = get_model( model_config=self.vllm_model_config, - device_config=self.device_config, load_config=self.load_config, - lora_config=None, - multimodal_config=None, + device_config=self.device_config, parallel_config=None, scheduler_config=None, + lora_config=None, cache_config=None, ) self.sliding_window_size = ( @@ -627,4 +626,4 @@ def load_model_cls_srt(model_arch: str) -> Optional[Type[nn.Module]]: # Monkey patch model loader -setattr(ModelRegistry, "load_model_cls", load_model_cls_srt) +setattr(ModelRegistry, "_try_load_model_cls", load_model_cls_srt) diff --git a/python/sglang/version.py b/python/sglang/version.py index 11ef092868..f3291e93b7 100644 --- a/python/sglang/version.py +++ b/python/sglang/version.py @@ -1 +1 @@ -__version__ = "0.2.13" +__version__ = "0.2.14" From 9935f97b3e594e246776466d04134decff1b59ae Mon Sep 17 00:00:00 2001 From: havetc Date: Mon, 26 Aug 2024 18:37:26 +0200 Subject: [PATCH 093/118] [FEAT] JSON constrained support (#1125) Co-authored-by: Yineng Zhang --- docs/en/sampling_params.md | 3 + python/sglang/srt/constrained/fsm_cache.py | 13 ++- python/sglang/srt/constrained/jump_forward.py | 1 + python/sglang/srt/managers/schedule_batch.py | 7 ++ python/sglang/srt/managers/tp_worker.py | 21 +++- python/sglang/srt/openai_api/adapter.py | 2 + python/sglang/srt/openai_api/protocol.py | 2 + python/sglang/srt/sampling/sampling_params.py | 4 + test/srt/run_suite.py | 1 + test/srt/test_json_constrained.py | 96 +++++++++++++++++++ 10 files changed, 147 insertions(+), 3 deletions(-) create mode 100644 test/srt/test_json_constrained.py diff --git a/docs/en/sampling_params.md b/docs/en/sampling_params.md index 54b03bf325..0e1c13e4bd 100644 --- a/docs/en/sampling_params.md +++ b/docs/en/sampling_params.md @@ -60,6 +60,9 @@ spaces_between_special_tokens: bool = True, regex: Optional[str] = None, # Do parallel sampling and return `n` outputs. n: int = 1, +# Constrains the output to follow a given JSON schema. +# `regex` and `json_schema` cannot be set at the same time. +json_schema: Optional[str] = None, ## Penalties. See [Performance Implications on Penalties] section below for more informations. 
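For quick reference, here is a minimal client-side sketch of the `json_schema` sampling parameter documented above. It closely follows the usage added in `test/srt/test_json_constrained.py` later in this patch; the server address, port, and example schema are illustrative assumptions, not part of the patch itself.

```python
import json

import requests

# Assumes an SGLang server is already running locally, e.g.:
#   python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
json_schema = json.dumps(
    {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "population": {"type": "integer"},
        },
        "required": ["name", "population"],
    }
)

response = requests.post(
    "http://127.0.0.1:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 64,
            # `regex` and `json_schema` cannot be set at the same time.
            "json_schema": json_schema,
        },
    },
)
# The constrained output decodes into a dict matching the schema.
print(json.loads(response.json()["text"]))
```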
diff --git a/python/sglang/srt/constrained/fsm_cache.py b/python/sglang/srt/constrained/fsm_cache.py index fa41f90de3..6bc6ea6d26 100644 --- a/python/sglang/srt/constrained/fsm_cache.py +++ b/python/sglang/srt/constrained/fsm_cache.py @@ -15,6 +15,8 @@ """Cache for the compressed finite state machine.""" +from outlines.fsm.json_schema import build_regex_from_schema + from sglang.srt.constrained import RegexGuide, TransformerTokenizer from sglang.srt.constrained.base_tool_cache import BaseToolCache @@ -26,9 +28,12 @@ def __init__( tokenizer_args_dict, enable=True, skip_tokenizer_init=False, + json_schema_mode=False, ): super().__init__(enable=enable) + self.json_schema_mode = json_schema_mode + if ( skip_tokenizer_init or tokenizer_path.endswith(".json") @@ -72,5 +77,9 @@ def fset(self, value): tokenizer_path, **tokenizer_args_dict ) - def init_value(self, regex): - return RegexGuide(regex, self.outlines_tokenizer) + def init_value(self, value): + if self.json_schema_mode: + regex = build_regex_from_schema(value) + return RegexGuide(regex, self.outlines_tokenizer), regex + else: + return RegexGuide(value, self.outlines_tokenizer) diff --git a/python/sglang/srt/constrained/jump_forward.py b/python/sglang/srt/constrained/jump_forward.py index b00c48d478..244931e050 100644 --- a/python/sglang/srt/constrained/jump_forward.py +++ b/python/sglang/srt/constrained/jump_forward.py @@ -23,6 +23,7 @@ import interegular import outlines.caching +from outlines.fsm.json_schema import build_regex_from_schema from sglang.srt.constrained import ( FSMInfo, diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index dfd32dea9c..cc180ba21b 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -268,7 +268,14 @@ def jump_forward_and_retokenize(self, jump_forward_str, next_state): all_text = self.origin_input_text + self.decoded_text + jump_forward_str all_ids = self.tokenizer.encode(all_text) + if not all_ids: + warnings.warn("Encoded all_text resulted in empty all_ids") + return False + prompt_tokens = len(self.origin_input_ids_unpadded) + if prompt_tokens > len(all_ids): + warnings.warn("prompt_tokens is larger than encoded all_ids") + return False if all_ids[prompt_tokens - 1] != self.origin_input_ids_unpadded[-1]: # TODO(lsyin): fix token fusion diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index ddf20970e7..127f71900a 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -197,6 +197,16 @@ def __init__( "trust_remote_code": server_args.trust_remote_code, }, skip_tokenizer_init=server_args.skip_tokenizer_init, + json_schema_mode=False, + ) + self.json_fsm_cache = FSMCache( + server_args.tokenizer_path, + { + "tokenizer_mode": server_args.tokenizer_mode, + "trust_remote_code": server_args.trust_remote_code, + }, + skip_tokenizer_init=server_args.skip_tokenizer_init, + json_schema_mode=True, ) self.jump_forward_cache = JumpForwardCache() @@ -349,8 +359,17 @@ def handle_generate_request( req.top_logprobs_num = recv_req.top_logprobs_num req.stream = recv_req.stream + # Init regex fsm fron json + if req.sampling_params.json_schema is not None: + req.regex_fsm, computed_regex_string = self.json_fsm_cache.query( + req.sampling_params.json_schema + ) + if not self.disable_regex_jump_forward: + req.jump_forward_map = self.jump_forward_cache.query( + computed_regex_string + ) # Init regex fsm - if 
req.sampling_params.regex is not None: + elif req.sampling_params.regex is not None: req.regex_fsm = self.regex_fsm_cache.query(req.sampling_params.regex) if not self.disable_regex_jump_forward: req.jump_forward_map = self.jump_forward_cache.query( diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index f325e84b2f..148f2689d5 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -434,6 +434,7 @@ def v1_generate_request(all_requests: List[CompletionRequest]): "frequency_penalty": request.frequency_penalty, "repetition_penalty": request.repetition_penalty, "regex": request.regex, + "json_schema": request.json_schema, "n": request.n, "ignore_eos": request.ignore_eos, } @@ -802,6 +803,7 @@ def v1_chat_generate_request( "frequency_penalty": request.frequency_penalty, "repetition_penalty": request.repetition_penalty, "regex": request.regex, + "json_schema": request.json_schema, "n": request.n, } ) diff --git a/python/sglang/srt/openai_api/protocol.py b/python/sglang/srt/openai_api/protocol.py index 758e48edef..ce51e1c029 100644 --- a/python/sglang/srt/openai_api/protocol.py +++ b/python/sglang/srt/openai_api/protocol.py @@ -161,6 +161,7 @@ class CompletionRequest(BaseModel): # Extra parameters for SRT backend only and will be ignored by OpenAI models. regex: Optional[str] = None + json_schema: Optional[str] = None ignore_eos: Optional[bool] = False min_tokens: Optional[int] = 0 repetition_penalty: Optional[float] = 1.0 @@ -262,6 +263,7 @@ class ChatCompletionRequest(BaseModel): # Extra parameters for SRT backend only and will be ignored by OpenAI models. regex: Optional[str] = None + json_schema: Optional[str] = None min_tokens: Optional[int] = 0 repetition_penalty: Optional[float] = 1.0 stop_token_ids: Optional[List[int]] = Field(default_factory=list) diff --git a/python/sglang/srt/sampling/sampling_params.py b/python/sglang/srt/sampling/sampling_params.py index c30717dd7c..8111757d85 100644 --- a/python/sglang/srt/sampling/sampling_params.py +++ b/python/sglang/srt/sampling/sampling_params.py @@ -39,6 +39,7 @@ def __init__( spaces_between_special_tokens: bool = True, regex: Optional[str] = None, n: int = 1, + json_schema: Optional[str] = None, ) -> None: self.temperature = temperature self.top_p = top_p @@ -56,6 +57,7 @@ def __init__( self.spaces_between_special_tokens = spaces_between_special_tokens self.regex = regex self.n = n + self.json_schema = json_schema # Process some special cases if self.temperature < _SAMPLING_EPS: @@ -106,6 +108,8 @@ def verify(self): f"min_new_tokens must be in (0, max_new_tokens({self.max_new_tokens})], got " f"{self.min_new_tokens}." 
) + if self.regex is not None and self.json_schema is not None: + raise ValueError("regex and json_schema cannot be both set.") def normalize(self, tokenizer): # Process stop strings diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 2351579f19..cafcf3f2d5 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -13,6 +13,7 @@ "test_eval_accuracy_mini.py", "test_large_max_new_tokens.py", "test_openai_server.py", + "test_json_constrained.py", "test_skip_tokenizer_init.py", "test_torch_compile.py", "test_triton_attn_backend.py", diff --git a/test/srt/test_json_constrained.py b/test/srt/test_json_constrained.py new file mode 100644 index 0000000000..5393ecc33c --- /dev/null +++ b/test/srt/test_json_constrained.py @@ -0,0 +1,96 @@ +import json +import unittest + +import openai +import requests + +from sglang.srt.utils import kill_child_process +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + + +class TestJSONConstrained(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.api_key = "sk-123456" + cls.json_schema = json.dumps( + { + "type": "object", + "properties": { + "name": {"type": "string", "pattern": "^[\\w]+$"}, + "population": {"type": "integer"}, + }, + "required": ["name", "population"], + } + ) + cls.process = popen_launch_server( + cls.model, cls.base_url, timeout=300, api_key=cls.api_key + ) + + @classmethod + def tearDownClass(cls): + kill_child_process(cls.process.pid) + + def run_decode(self, return_logprob=False, top_logprobs_num=0, n=1): + headers = {"Authorization": f"Bearer {self.api_key}"} + response = requests.post( + self.base_url + "/generate", + json={ + "text": "The capital of France is", + "sampling_params": { + "temperature": 0 if n == 1 else 0.5, + "max_new_tokens": 128, + "n": n, + "stop_token_ids": [119690], + "json_schema": self.json_schema, + }, + "stream": False, + "return_logprob": return_logprob, + "top_logprobs_num": top_logprobs_num, + "logprob_start_len": 0, + }, + headers=headers, + ) + print(json.dumps(response.json())) + print("=" * 100) + try: + js_obj = json.loads(response.json()["text"]) + except (TypeError, json.decoder.JSONDecodeError): + raise + assert isinstance(js_obj["name"], str) + assert isinstance(js_obj["population"], int) + + def test_json_generate(self): + self.run_decode() + + def test_json_openai(self): + client = openai.Client(api_key=self.api_key, base_url=f"{self.base_url}/v1") + + response = client.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": "You are a helpful AI assistant"}, + {"role": "user", "content": "Introduce the capital of France."}, + ], + temperature=0, + max_tokens=128, + extra_body={"json_schema": self.json_schema}, + ) + text = response.choices[0].message.content + + try: + js_obj = json.loads(text) + except (TypeError, json.decoder.JSONDecodeError): + print("JSONDecodeError", text) + raise + assert isinstance(js_obj["name"], str) + assert isinstance(js_obj["population"], int) + + +if __name__ == "__main__": + unittest.main() From c61a1b6f97c61ebd80bada10c60c8ab75d2745b9 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Mon, 26 Aug 2024 13:52:58 -0700 Subject: [PATCH 094/118] Torch compile CI throughput test (#1223) --- test/srt/test_torch_compile.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/test/srt/test_torch_compile.py 
b/test/srt/test_torch_compile.py index 26daf4fa57..e8cafa15d2 100644 --- a/test/srt/test_torch_compile.py +++ b/test/srt/test_torch_compile.py @@ -1,6 +1,8 @@ import unittest from types import SimpleNamespace +import requests + from sglang.srt.utils import kill_child_process from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( @@ -20,7 +22,7 @@ def setUpClass(cls): cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--enable-torch-compile"], + other_args=["--enable-torch-compile", "--disable-radix-cache"], ) @classmethod @@ -39,6 +41,33 @@ def test_mmlu(self): metrics = run_eval(args) assert metrics["score"] >= 0.6 + def run_decode(self, max_new_tokens): + response = requests.post( + self.base_url + "/generate", + json={ + "text": "The capital of France is", + "sampling_params": { + "temperature": 0, + "max_new_tokens": max_new_tokens, + }, + "ignore_eos": True, + }, + ) + return response.json() + + def test_throughput(self): + import time + + max_tokens = 256 + + tic = time.time() + res = self.run_decode(max_tokens) + tok = time.time() + print(res["text"]) + throughput = max_tokens / (tok - tic) + print(f"Throughput: {throughput} tokens/s") + assert throughput >= 152 + if __name__ == "__main__": unittest.main() From 2f1d92834f41df42e266ed6d7036b4add906d21f Mon Sep 17 00:00:00 2001 From: caiyueliang <393900414@qq.com> Date: Tue, 27 Aug 2024 07:28:26 +0800 Subject: [PATCH 095/118] [FEAT] Support batches cancel (#1222) Co-authored-by: Yineng Zhang --- python/sglang/srt/openai_api/adapter.py | 87 +++++++++++++++++++++++-- python/sglang/srt/server.py | 7 ++ test/srt/test_openai_server.py | 34 +++++++++- 3 files changed, 122 insertions(+), 6 deletions(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 148f2689d5..4feb632b0b 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -275,10 +275,12 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe end_point = batch_storage[batch_id].endpoint file_request_list = [] all_requests = [] + request_ids = [] for line in lines: request_data = json.loads(line) file_request_list.append(request_data) body = request_data["body"] + request_ids.append(request_data["custom_id"]) # Although streaming is supported for standalone completions, it is not supported in # batch mode (multiple completions in single request). 
@@ -289,12 +291,16 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe all_requests.append(ChatCompletionRequest(**body)) elif end_point == "/v1/completions": all_requests.append(CompletionRequest(**body)) + if end_point == "/v1/chat/completions": adapted_request, request = v1_chat_generate_request( - all_requests, tokenizer_manager + all_requests, tokenizer_manager, request_ids=request_ids ) elif end_point == "/v1/completions": - adapted_request, request = v1_generate_request(all_requests) + adapted_request, request = v1_generate_request( + all_requests, request_ids=request_ids + ) + try: ret = await tokenizer_manager.generate_request(adapted_request).__anext__() if not isinstance(ret, list): @@ -326,6 +332,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe } all_ret.append(response_json) completed_requests += 1 + # Write results to a new file output_file_id = f"backend_result_file-{uuid.uuid4()}" global storage_dir @@ -372,6 +379,72 @@ async def v1_retrieve_batch(batch_id: str): return batch_response +async def v1_cancel_batch(tokenizer_manager, batch_id: str): + # Retrieve the batch job from the in-memory storage + batch_response = batch_storage.get(batch_id) + if batch_response is None: + raise HTTPException(status_code=404, detail="Batch not found") + + # Only do cancal when status is "validating" or "in_progress" + if batch_response.status in ["validating", "in_progress"]: + # Start cancelling the batch asynchronously + asyncio.create_task( + cancel_batch( + tokenizer_manager=tokenizer_manager, + batch_id=batch_id, + input_file_id=batch_response.input_file_id, + ) + ) + + # Update batch status to "cancelling" + batch_response.status = "cancelling" + + return batch_response + else: + raise HTTPException( + status_code=500, + detail=f"Current status is {batch_response.status}, no need to cancel", + ) + + +async def cancel_batch(tokenizer_manager, batch_id: str, input_file_id: str): + try: + # Update the batch status to "cancelling" + batch_storage[batch_id].status = "cancelling" + + # Retrieve the input file content + input_file_request = file_id_request.get(input_file_id) + if not input_file_request: + raise ValueError("Input file not found") + + # Parse the JSONL file and process each request + input_file_path = file_id_storage.get(input_file_id) + with open(input_file_path, "r", encoding="utf-8") as f: + lines = f.readlines() + + file_request_list = [] + request_ids = [] + for line in lines: + request_data = json.loads(line) + file_request_list.append(request_data) + request_ids.append(request_data["custom_id"]) + + # Cancel requests by request_ids + for rid in request_ids: + tokenizer_manager.abort_request(rid=rid) + + retrieve_batch = batch_storage[batch_id] + retrieve_batch.status = "cancelled" + + except Exception as e: + logger.error("error in SGLang:", e) + # Update batch status to "failed" + retrieve_batch = batch_storage[batch_id] + retrieve_batch.status = "failed" + retrieve_batch.failed_at = int(time.time()) + retrieve_batch.errors = {"message": str(e)} + + async def v1_retrieve_file(file_id: str): # Retrieve the batch job from the in-memory storage file_response = file_id_response.get(file_id) @@ -392,7 +465,9 @@ def iter_file(): return StreamingResponse(iter_file(), media_type="application/octet-stream") -def v1_generate_request(all_requests: List[CompletionRequest]): +def v1_generate_request( + all_requests: List[CompletionRequest], request_ids: List[str] = None +): prompts = [] sampling_params_list = [] 
return_logprobs = [] @@ -464,6 +539,7 @@ def v1_generate_request(all_requests: List[CompletionRequest]): logprob_start_len=logprob_start_lens, return_text_in_logprobs=True, stream=all_requests[0].stream, + rid=request_ids, ) if len(all_requests) == 1: @@ -746,7 +822,9 @@ async def generate_stream_resp(): def v1_chat_generate_request( - all_requests: List[ChatCompletionRequest], tokenizer_manager + all_requests: List[ChatCompletionRequest], + tokenizer_manager, + request_ids: List[str] = None, ): input_ids = [] sampling_params_list = [] @@ -834,6 +912,7 @@ def v1_chat_generate_request( top_logprobs_num=top_logprobs_nums, stream=all_requests[0].stream, return_text_in_logprobs=True, + rid=request_ids, ) if len(all_requests) == 1: return adapted_request, all_requests[0] diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 021f231aa7..6d1fc9fda7 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -59,6 +59,7 @@ from sglang.srt.openai_api.adapter import ( load_chat_template_for_openai_api, v1_batches, + v1_cancel_batch, v1_chat_completions, v1_completions, v1_delete_file, @@ -246,6 +247,12 @@ async def openai_v1_batches(raw_request: Request): return await v1_batches(tokenizer_manager, raw_request) +@app.post("/v1/batches/{batch_id}/cancel") +async def cancel_batches(batch_id: str): + # https://platform.openai.com/docs/api-reference/batch/cancel + return await v1_cancel_batch(tokenizer_manager, batch_id) + + @app.get("/v1/batches/{batch_id}") async def retrieve_batch(batch_id: str): return await v1_retrieve_batch(batch_id) diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index ce130956de..cfc65b7e6a 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -256,8 +256,7 @@ def run_chat_completion_stream(self, logprobs, parallel_sample_num=1): index, True ), f"index {index} is not found in the response" - def run_batch(self, mode): - client = openai.Client(api_key=self.api_key, base_url=self.base_url) + def _create_batch(self, mode, client): if mode == "completion": input_file_path = "complete_input.jsonl" # write content to input file @@ -333,9 +332,11 @@ def run_batch(self, mode): }, }, ] + with open(input_file_path, "w") as file: for line in content: file.write(json.dumps(line) + "\n") + with open(input_file_path, "rb") as file: uploaded_file = client.files.create(file=file, purpose="batch") if mode == "completion": @@ -348,6 +349,13 @@ def run_batch(self, mode): endpoint=endpoint, completion_window=completion_window, ) + + return batch_job, content + + def run_batch(self, mode): + client = openai.Client(api_key=self.api_key, base_url=self.base_url) + batch_job, content = self._create_batch(mode=mode, client=client) + while batch_job.status not in ["completed", "failed", "cancelled"]: time.sleep(3) print( @@ -371,6 +379,24 @@ def run_batch(self, mode): ] assert len(results) == len(content) + def run_cancel_batch(self, mode): + client = openai.Client(api_key=self.api_key, base_url=self.base_url) + batch_job, _ = self._create_batch(mode=mode, client=client) + + assert batch_job.status not in ["cancelling", "cancelled"] + + batch_job = client.batches.cancel(batch_id=batch_job.id) + assert batch_job.status == "cancelling" + + while batch_job.status not in ["failed", "cancelled"]: + batch_job = client.batches.retrieve(batch_job.id) + print( + f"Batch job status: {batch_job.status}...trying again in 3 seconds..." 
+ ) + time.sleep(3) + + assert batch_job.status == "cancelled" + def test_completion(self): for echo in [False, True]: for logprobs in [None, 5]: @@ -414,6 +440,10 @@ def test_batch(self): for mode in ["completion", "chat"]: self.run_batch(mode) + def test_calcel_batch(self): + for mode in ["completion", "chat"]: + self.run_cancel_batch(mode) + def test_regex(self): client = openai.Client(api_key=self.api_key, base_url=self.base_url) From 5ff25cdf5b1310e83d9e595142b39ae4d7b561e9 Mon Sep 17 00:00:00 2001 From: yichuan~ <73766326+yichuan520030910320@users.noreply.github.com> Date: Mon, 26 Aug 2024 22:04:52 -0700 Subject: [PATCH 096/118] [Minor] add delete test and delete tmp file on ci server (#1227) --- .../usage/openai_parallel_sample.py | 153 ------------------ test/srt/test_openai_server.py | 11 +- 2 files changed, 8 insertions(+), 156 deletions(-) delete mode 100644 examples/frontend_language/usage/openai_parallel_sample.py diff --git a/examples/frontend_language/usage/openai_parallel_sample.py b/examples/frontend_language/usage/openai_parallel_sample.py deleted file mode 100644 index 753e66c744..0000000000 --- a/examples/frontend_language/usage/openai_parallel_sample.py +++ /dev/null @@ -1,153 +0,0 @@ -import openai - -client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY") - -# Text completion -response = client.completions.create( - model="default", - prompt="I am a robot and I want to study like humans. Now let's tell a story. Once upon a time, there was a little", - n=1, - temperature=0.8, - max_tokens=32, -) -print(response) - - -# Text completion -response = client.completions.create( - model="default", - prompt="I am a robot and I want to study like humans. Now let's tell a story. Once upon a time, there was a little", - n=5, - temperature=0.8, - max_tokens=320, -) -print(response) - - -# Text completion -response = client.completions.create( - model="default", - prompt="I am a robot and I want to study like humans. Now let's tell a story. Once upon a time, there was a little", - n=3, - temperature=0.8, - max_tokens=32, -) -print(response) - - -# Text completion -response = client.completions.create( - model="default", - prompt=["The name of the famous soccer player is"], - n=1, - temperature=0.8, - max_tokens=128, -) -print(response) - - -# Text completion -response = client.completions.create( - model="default", - prompt=["The name of the famous soccer player is ", "The capital of US is"], - n=1, - temperature=0.8, - max_tokens=32, -) -print(response) - - -# Text completion -response = client.completions.create( - model="default", - prompt=["The name of the famous soccer player is ", "The capital of US is"], - n=3, - temperature=0.8, - max_tokens=32, -) -print(response) - - -response = client.completions.create( - model="default", - prompt=[ - "prompt1: I am a robot and I want to learn like humans. Now let's begin a tale. Once upon a time, there was a small", - "prompt2: As a robot, my goal is to understand human learning. Let's start a story. In a faraway land, there lived a tiny", - "prompt3: Being a robot, I aspire to study like people. Let's share a story. Long ago, there was a little", - "prompt4: I am a robot aiming to learn like humans. Let's narrate a story. Once, in a distant kingdom, there was a young", - "prompt5: As a robot, I seek to learn in human ways. Let's tell a story. 
Once upon a time, in a small village, there was a young", - ], - n=1, - temperature=0.8, - max_tokens=320, -) -print(response) - - -# Text completion -response = client.completions.create( - model="default", - prompt=[ - "The capital of France is", - "The capital of Germany is", - "The capital of US is", - ], - n=3, - temperature=0.8, - max_tokens=32, -) -print(response) - -# Chat completion -response = client.chat.completions.create( - model="default", - messages=[ - {"role": "system", "content": "You are a helpful AI assistant"}, - {"role": "user", "content": "List 3 countries and their capitals."}, - ], - temperature=0.8, - max_tokens=1, - logprobs=True, - top_logprobs=3, -) -print(response) - -# Chat completion -response = client.chat.completions.create( - model="default", - messages=[ - {"role": "system", "content": "You are a helpful AI assistant"}, - {"role": "user", "content": "List 3 countries and their capitals."}, - ], - temperature=0.8, - max_tokens=1, - n=1, -) -print(response) - -# Chat completion -response = client.chat.completions.create( - model="default", - messages=[ - {"role": "system", "content": "You are a helpful AI assistant"}, - {"role": "user", "content": "List 3 countries and their capitals."}, - ], - temperature=0.8, - max_tokens=1, - logprobs=True, - top_logprobs=3, -) -print(response) - -# Chat completion -response = client.chat.completions.create( - model="default", - messages=[ - {"role": "system", "content": "You are a helpful AI assistant"}, - {"role": "user", "content": "List 3 countries and their capitals."}, - ], - temperature=0.8, - max_tokens=1, - n=4, -) -print(response) diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index cfc65b7e6a..3fc5785517 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -350,11 +350,11 @@ def _create_batch(self, mode, client): completion_window=completion_window, ) - return batch_job, content + return batch_job, content, uploaded_file def run_batch(self, mode): client = openai.Client(api_key=self.api_key, base_url=self.base_url) - batch_job, content = self._create_batch(mode=mode, client=client) + batch_job, content, uploaded_file = self._create_batch(mode=mode, client=client) while batch_job.status not in ["completed", "failed", "cancelled"]: time.sleep(3) @@ -378,10 +378,13 @@ def run_batch(self, mode): if line.strip() != "" ] assert len(results) == len(content) + for delete_fid in [uploaded_file.id, result_file_id]: + del_pesponse = client.files.delete(delete_fid) + assert del_pesponse.deleted def run_cancel_batch(self, mode): client = openai.Client(api_key=self.api_key, base_url=self.base_url) - batch_job, _ = self._create_batch(mode=mode, client=client) + batch_job, _, uploaded_file = self._create_batch(mode=mode, client=client) assert batch_job.status not in ["cancelling", "cancelled"] @@ -396,6 +399,8 @@ def run_cancel_batch(self, mode): time.sleep(3) assert batch_job.status == "cancelled" + del_response = client.files.delete(uploaded_file.id) + assert del_response.deleted def test_completion(self): for echo in [False, True]: From 909f34363bf551711c20dbadbd5cc7fb6517a614 Mon Sep 17 00:00:00 2001 From: havetc Date: Tue, 27 Aug 2024 12:10:46 +0200 Subject: [PATCH 097/118] [FIX] Wrong logger (#1230) --- python/sglang/srt/managers/schedule_batch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index cc180ba21b..926266a628 100644 --- 
a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -269,12 +269,12 @@ def jump_forward_and_retokenize(self, jump_forward_str, next_state): all_text = self.origin_input_text + self.decoded_text + jump_forward_str all_ids = self.tokenizer.encode(all_text) if not all_ids: - warnings.warn("Encoded all_text resulted in empty all_ids") + logger.warning("Encoded all_text resulted in empty all_ids") return False prompt_tokens = len(self.origin_input_ids_unpadded) if prompt_tokens > len(all_ids): - warnings.warn("prompt_tokens is larger than encoded all_ids") + logger.warning("prompt_tokens is larger than encoded all_ids") return False if all_ids[prompt_tokens - 1] != self.origin_input_ids_unpadded[-1]: From 3602692c7ca7c3757cc3d2b5dfc829209205731a Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 27 Aug 2024 21:15:31 +1000 Subject: [PATCH 098/118] feat: replace get_act_fn for gpt_bigcode (#1231) --- python/sglang/srt/layers/activation.py | 83 +++++++++++++++++++++++++ python/sglang/srt/models/gpt_bigcode.py | 2 +- 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/activation.py b/python/sglang/srt/layers/activation.py index d0e0626604..832d2b7013 100644 --- a/python/sglang/srt/layers/activation.py +++ b/python/sglang/srt/layers/activation.py @@ -13,10 +13,20 @@ """Fused operators for activation layers.""" +from typing import Optional + import torch +import torch.nn as nn import torch.nn.functional as F from flashinfer.activation import gelu_tanh_and_mul, silu_and_mul +from vllm.distributed import ( + divide, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.utils import set_weight_attrs class SiluAndMul(CustomOp): @@ -53,3 +63,76 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: out = torch.empty(output_shape, dtype=x.dtype, device=x.device) gelu_tanh_and_mul(x, out) return out + + +class ScaledActivation(nn.Module): + """An activation function with post-scale parameters. + + This is used for some quantization methods like AWQ. 
+ """ + + def __init__( + self, + act_module: nn.Module, + intermediate_size: int, + input_is_parallel: bool = True, + params_dtype: Optional[torch.dtype] = None, + ): + super().__init__() + self.act = act_module + self.input_is_parallel = input_is_parallel + if input_is_parallel: + tp_size = get_tensor_model_parallel_world_size() + intermediate_size_per_partition = divide(intermediate_size, tp_size) + else: + intermediate_size_per_partition = intermediate_size + if params_dtype is None: + params_dtype = torch.get_default_dtype() + self.scales = nn.Parameter( + torch.empty(intermediate_size_per_partition, dtype=params_dtype) + ) + set_weight_attrs(self.scales, {"weight_loader": self.weight_loader}) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.act(x) / self.scales + + def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): + param_data = param.data + if self.input_is_parallel: + tp_rank = get_tensor_model_parallel_rank() + shard_size = param_data.shape[0] + start_idx = tp_rank * shard_size + loaded_weight = loaded_weight.narrow(0, start_idx, shard_size) + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +_ACTIVATION_REGISTRY = { + "gelu": nn.GELU(), + "gelu_pytorch_tanh": nn.GELU(approximate="tanh"), +} + + +def get_act_fn( + act_fn_name: str, + quant_config: Optional[QuantizationConfig] = None, + intermediate_size: Optional[int] = None, + input_is_parallel: bool = True, + params_dtype: Optional[torch.dtype] = None, +) -> nn.Module: + """Get an activation function by name.""" + act_fn_name = act_fn_name.lower() + if act_fn_name not in _ACTIVATION_REGISTRY: + raise ValueError(f"Activation function {act_fn_name!r} is not supported.") + + act_fn = _ACTIVATION_REGISTRY[act_fn_name] + if quant_config is not None and act_fn_name in quant_config.get_scaled_act_names(): + if intermediate_size is None: + raise ValueError( + "intermediate_size must be specified for scaled " + "activation functions." + ) + return ScaledActivation( + act_fn, intermediate_size, input_is_parallel, params_dtype + ) + return act_fn diff --git a/python/sglang/srt/models/gpt_bigcode.py b/python/sglang/srt/models/gpt_bigcode.py index 979d06886e..dc828f0142 100644 --- a/python/sglang/srt/models/gpt_bigcode.py +++ b/python/sglang/srt/models/gpt_bigcode.py @@ -23,7 +23,6 @@ from transformers import GPTBigCodeConfig from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -33,6 +32,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from sglang.srt.layers.activation import get_act_fn from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.layers.sampler import Sampler From c8a9e79186503c3bd1955cdbd4c364b04db333fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dr=2E=20Artificial=E6=9B=BE=E5=B0=8F=E5=81=A5?= <875100501@qq.com> Date: Wed, 28 Aug 2024 14:51:41 +0800 Subject: [PATCH 099/118] Fix readme (#1236) --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 09e3d56869..3f03fd7f1c 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,7 @@ docker run --gpus all \ ### Method 4: Using docker compose
+More > This method is recommended if you plan to serve it as a service. > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml). @@ -94,6 +95,7 @@ docker run --gpus all \ ### Method 5: Run on Kubernetes or Clouds with SkyPilot
+More To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot). @@ -262,6 +264,7 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec #### Use Models From ModelScope
+More To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE. ``` @@ -276,6 +279,7 @@ SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen #### Run Llama 3.1 405B
+More ```bash # Run 405B (fp8) on a single node From 1ece2cda3dde1df62c924c0288ec514f5c5e2af5 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Wed, 28 Aug 2024 00:37:32 -0700 Subject: [PATCH 100/118] Fix bench latency benchmark (#1225) --- .github/workflows/e2e-test.yml | 5 +++++ python/sglang/bench_latency.py | 10 ++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 7f555110d9..11c94775c1 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -38,6 +38,11 @@ jobs: cd test/srt python3 -m unittest test_serving_throughput.TestServingThroughput.test_default + - name: Benchmark Serving Latency + timeout-minutes: 10 + run: | + python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 1 --input 128 --output 8 + - name: Benchmark Serving Throughput (w/o RadixAttention) timeout-minutes: 10 run: | diff --git a/python/sglang/bench_latency.py b/python/sglang/bench_latency.py index dea910f577..6a918fbd11 100644 --- a/python/sglang/bench_latency.py +++ b/python/sglang/bench_latency.py @@ -200,16 +200,14 @@ def extend(reqs, model_runner): tree_cache=None, ) batch.prepare_for_extend(model_runner.model_config.vocab_size) - output = model_runner.forward(batch, ForwardMode.EXTEND) - next_token_ids = batch.sample(output.next_token_logits) - return next_token_ids, output.next_token_logits, batch + sample_output, logits_output = model_runner.forward(batch, ForwardMode.EXTEND) + return sample_output.batch_next_token_ids, logits_output.next_token_logits, batch def decode(input_token_ids, batch, model_runner): batch.prepare_for_decode(input_token_ids.cpu().numpy()) - output = model_runner.forward(batch, ForwardMode.DECODE) - next_token_ids = batch.sample(output.next_token_logits) - return next_token_ids, output.next_token_logits + sample_output, logits_output = model_runner.forward(batch, ForwardMode.DECODE) + return sample_output.batch_next_token_ids, logits_output.next_token_logits @torch.inference_mode() From 6cc38b2bf31c141e3ae06ca8c1150e35dbeb5578 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 28 Aug 2024 00:54:26 -0700 Subject: [PATCH 101/118] [Minor] Add more type annotations (#1237) --- .../srt/model_executor/cuda_graph_runner.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index 96c15849e4..40c87af88c 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -17,6 +17,7 @@ import bisect from contextlib import contextmanager +from typing import Callable, List import torch from flashinfer import BatchDecodeWithPagedKVCacheWrapper @@ -53,12 +54,12 @@ def _to_torch(model: torch.nn.Module, reverse: bool = False): @contextmanager def patch_model( - model: torch.nn.Module, use_compile: bool, tp_group: "GroupCoordinator" + model: torch.nn.Module, enable_compile: bool, tp_group: "GroupCoordinator" ): backup_ca_comm = None try: - if use_compile: + if enable_compile: _to_torch(model) monkey_patch_vllm_all_gather() backup_ca_comm = tp_group.ca_comm @@ -67,7 +68,7 @@ def patch_model( else: yield model.forward finally: - if use_compile: + if enable_compile: _to_torch(model, reverse=True) monkey_patch_vllm_all_gather(reverse=True) tp_group.ca_comm = backup_ca_comm @@ -88,7 +89,7 @@ def set_torch_compile_config(): class CudaGraphRunner: def 
__init__( self, - model_runner, + model_runner: "ModelRunner", max_batch_size_to_capture: int, use_torch_compile: bool, disable_padding: bool, @@ -154,13 +155,13 @@ def __init__( if use_torch_compile: set_torch_compile_config() - def can_run(self, batch_size): + def can_run(self, batch_size: int): if self.disable_padding: return batch_size in self.graphs else: return batch_size <= self.max_bs - def capture(self, batch_size_list): + def capture(self, batch_size_list: List[int]): self.batch_size_list = batch_size_list with graph_capture() as graph_capture_context: self.stream = graph_capture_context.stream @@ -181,7 +182,7 @@ def capture(self, batch_size_list): self.output_buffers[bs] = output_buffers self.flashinfer_handlers[bs] = flashinfer_handler - def capture_one_batch_size(self, bs, forward): + def capture_one_batch_size(self, bs: int, forward: Callable): graph = torch.cuda.CUDAGraph() stream = self.stream From 198974cd1a805a7fab2d81fe9e6b5fbd73d03fb8 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Wed, 28 Aug 2024 18:39:12 +1000 Subject: [PATCH 102/118] feat: support sm75 with FlashInfer v0.1.6 (#1233) --- README.md | 2 +- python/sglang/srt/layers/activation.py | 7 ------- python/sglang/srt/layers/layernorm.py | 3 --- python/sglang/srt/model_executor/model_runner.py | 2 ++ python/sglang/srt/server.py | 2 +- 5 files changed, 4 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 3f03fd7f1c..efb69c0437 100644 --- a/README.md +++ b/README.md @@ -135,7 +135,7 @@ sky status --endpoint 30000 sglang ### Common Notes -- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue. +- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise an issue. - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`. 
## Backend: SGLang Runtime (SRT) diff --git a/python/sglang/srt/layers/activation.py b/python/sglang/srt/layers/activation.py index 832d2b7013..5df387cb2b 100644 --- a/python/sglang/srt/layers/activation.py +++ b/python/sglang/srt/layers/activation.py @@ -30,18 +30,11 @@ class SiluAndMul(CustomOp): - def __init__(self, **kwargs): - super().__init__() - self.is_lower_sm80 = torch.cuda.get_device_capability()[0] < 8 - def forward_native(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 return F.silu(x[..., :d]) * x[..., d:] def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - if self.is_lower_sm80: - return self.forward_native(x) - d = x.shape[-1] // 2 output_shape = x.shape[:-1] + (d,) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) diff --git a/python/sglang/srt/layers/layernorm.py b/python/sglang/srt/layers/layernorm.py index 6cea85404a..ac4d368d3f 100644 --- a/python/sglang/srt/layers/layernorm.py +++ b/python/sglang/srt/layers/layernorm.py @@ -32,15 +32,12 @@ def __init__( super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps - self.is_lower_sm80 = torch.cuda.get_device_capability()[0] < 8 def forward_cuda( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - if self.is_lower_sm80: - return self.forward_native(x, residual) if residual is not None: fused_add_rmsnorm(x, residual, self.weight.data, self.variance_epsilon) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 0066061149..0dd9f8c201 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -161,6 +161,8 @@ def load_model(self): "Compute capability below sm80. Use float16 due to lack of bfloat16 support." ) self.server_args.dtype = "float16" + if torch.cuda.get_device_capability()[1] < 5: + raise RuntimeError("SGLang only supports sm75 and above.") monkey_patch_vllm_dummy_weight_loader() self.device_config = DeviceConfig() diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 6d1fc9fda7..f3d1ab0f94 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -421,7 +421,7 @@ def _set_envs_and_config(server_args: ServerArgs): if not server_args.disable_flashinfer: assert_pkg_version( "flashinfer", - "0.1.5", + "0.1.6", "Please uninstall the old version and " "reinstall the latest version by following the instructions " "at https://docs.flashinfer.ai/installation.html.", From 184ae1c68316c58a7f5b4ad813639b08604369f5 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 28 Aug 2024 02:15:52 -0700 Subject: [PATCH 103/118] Update README.md (#1239) --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index efb69c0437..305df444d0 100644 --- a/README.md +++ b/README.md @@ -297,7 +297,9 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/ ### Benchmark Performance -- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`. 
+- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. + Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. + A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, please use `sglang.bench_serving` instead. ``` python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32 ``` From f25f4dfde5af9a81be52c1ba6d99cc2ac5cca179 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Wed, 28 Aug 2024 21:16:47 +1000 Subject: [PATCH 104/118] hotfix: revert sampler CUDA Graph (#1242) --- .github/workflows/e2e-test.yml | 5 -- README.md | 2 +- python/pyproject.toml | 2 +- python/sglang/bench_latency.py | 10 ++- python/sglang/srt/layers/logits_processor.py | 8 +- python/sglang/srt/layers/sampler.py | 83 ++++--------------- python/sglang/srt/managers/schedule_batch.py | 28 ++----- python/sglang/srt/managers/tp_worker.py | 52 +++++------- .../srt/model_executor/cuda_graph_runner.py | 33 ++------ .../srt/model_executor/forward_batch_info.py | 9 +- .../sglang/srt/model_executor/model_runner.py | 14 +--- python/sglang/srt/models/chatglm.py | 16 +++- python/sglang/srt/models/commandr.py | 6 +- python/sglang/srt/models/dbrx.py | 6 +- python/sglang/srt/models/deepseek.py | 6 +- python/sglang/srt/models/deepseek_v2.py | 6 +- python/sglang/srt/models/gemma.py | 6 +- python/sglang/srt/models/gemma2.py | 6 +- python/sglang/srt/models/gpt_bigcode.py | 6 +- python/sglang/srt/models/grok.py | 6 +- python/sglang/srt/models/internlm2.py | 6 +- python/sglang/srt/models/llama2.py | 10 +-- .../sglang/srt/models/llama_classification.py | 4 +- python/sglang/srt/models/minicpm.py | 6 +- python/sglang/srt/models/mixtral.py | 6 +- python/sglang/srt/models/mixtral_quant.py | 6 +- python/sglang/srt/models/qwen.py | 7 +- python/sglang/srt/models/qwen2.py | 8 +- python/sglang/srt/models/qwen2_moe.py | 19 +++-- python/sglang/srt/models/stablelm.py | 6 +- .../srt/sampling/sampling_batch_info.py | 75 +---------------- python/sglang/test/runners.py | 2 +- python/sglang/version.py | 2 +- 33 files changed, 119 insertions(+), 348 deletions(-) diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 11c94775c1..7f555110d9 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -38,11 +38,6 @@ jobs: cd test/srt python3 -m unittest test_serving_throughput.TestServingThroughput.test_default - - name: Benchmark Serving Latency - timeout-minutes: 10 - run: | - python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 1 --input 128 --output 8 - - name: Benchmark Serving Throughput (w/o RadixAttention) timeout-minutes: 10 run: | diff --git a/README.md b/README.md index 305df444d0..223f9624f6 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ ### Method 2: From source ``` # Use the last release branch -git clone -b v0.2.14 https://github.com/sgl-project/sglang.git +git clone -b v0.2.14.post1 https://github.com/sgl-project/sglang.git cd sglang pip install --upgrade pip diff --git a/python/pyproject.toml b/python/pyproject.toml index 4a46adc3fe..7b2741fd21 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "sglang" 
-version = "0.2.14" +version = "0.2.14.post1" description = "SGLang is yet another fast serving framework for large language models and vision language models." readme = "README.md" requires-python = ">=3.8" diff --git a/python/sglang/bench_latency.py b/python/sglang/bench_latency.py index 6a918fbd11..dea910f577 100644 --- a/python/sglang/bench_latency.py +++ b/python/sglang/bench_latency.py @@ -200,14 +200,16 @@ def extend(reqs, model_runner): tree_cache=None, ) batch.prepare_for_extend(model_runner.model_config.vocab_size) - sample_output, logits_output = model_runner.forward(batch, ForwardMode.EXTEND) - return sample_output.batch_next_token_ids, logits_output.next_token_logits, batch + output = model_runner.forward(batch, ForwardMode.EXTEND) + next_token_ids = batch.sample(output.next_token_logits) + return next_token_ids, output.next_token_logits, batch def decode(input_token_ids, batch, model_runner): batch.prepare_for_decode(input_token_ids.cpu().numpy()) - sample_output, logits_output = model_runner.forward(batch, ForwardMode.DECODE) - return sample_output.batch_next_token_ids, logits_output.next_token_logits + output = model_runner.forward(batch, ForwardMode.DECODE) + next_token_ids = batch.sample(output.next_token_logits) + return next_token_ids, output.next_token_logits @torch.inference_mode() diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py index b81f3d2a04..63f74d8b02 100644 --- a/python/sglang/srt/layers/logits_processor.py +++ b/python/sglang/srt/layers/logits_processor.py @@ -29,7 +29,7 @@ @dataclasses.dataclass -class LogitsProcessorOutput: +class LogitProcessorOutput: # The logits of the next tokens. shape: [#seq, vocab_size] next_token_logits: torch.Tensor # The logprobs of the next tokens. shape: [#seq, vocab_size] @@ -185,7 +185,7 @@ def forward( # Return only last_logits if logprob is not requested if not logits_metadata.return_logprob: - return LogitsProcessorOutput( + return LogitProcessorOutput( next_token_logits=last_logits, next_token_logprobs=None, normalized_prompt_logprobs=None, @@ -209,7 +209,7 @@ def forward( else: output_top_logprobs = None - return LogitsProcessorOutput( + return LogitProcessorOutput( next_token_logits=last_logits, next_token_logprobs=last_logprobs, normalized_prompt_logprobs=None, @@ -278,7 +278,7 @@ def forward( # Remove the last token logprob for the prefill tokens. 
input_token_logprobs = input_token_logprobs[:-1] - return LogitsProcessorOutput( + return LogitProcessorOutput( next_token_logits=last_logits, next_token_logprobs=last_logprobs, normalized_prompt_logprobs=normalized_prompt_logprobs, diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py index 6cb7d0a7c1..3006e765c8 100644 --- a/python/sglang/srt/layers/sampler.py +++ b/python/sglang/srt/layers/sampler.py @@ -1,6 +1,4 @@ -import dataclasses import logging -from typing import Union import torch from flashinfer.sampling import ( @@ -11,8 +9,6 @@ ) from vllm.model_executor.custom_op import CustomOp -from sglang.srt.layers.logits_processor import LogitsProcessorOutput - # TODO: move this dict to another place from sglang.srt.managers.schedule_batch import global_server_args_dict from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo @@ -20,71 +16,30 @@ logger = logging.getLogger(__name__) -@dataclasses.dataclass -class SampleOutput: - success: torch.Tensor - probs: torch.Tensor - batch_next_token_ids: torch.Tensor - - class Sampler(CustomOp): def __init__(self): super().__init__() - def _apply_penalties(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo): - # min-token, presence, frequency - if sampling_info.linear_penalties is not None: - logits += sampling_info.linear_penalties - - # repetition - if sampling_info.scaling_penalties is not None: - logits = torch.where( - logits > 0, - logits / sampling_info.scaling_penalties, - logits * sampling_info.scaling_penalties, - ) - - return logits - - def _get_probs( - self, - logits: torch.Tensor, - sampling_info: SamplingBatchInfo, - is_torch_compile: bool = False, - ): + def forward_cuda(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo): # Post process logits logits = logits.contiguous() logits.div_(sampling_info.temperatures) - if is_torch_compile: - # FIXME: Temporary workaround for unknown bugs in torch.compile - logits.add_(0) - if sampling_info.logit_bias is not None: logits.add_(sampling_info.logit_bias) if sampling_info.vocab_mask is not None: logits = logits.masked_fill(~sampling_info.vocab_mask, float("-inf")) - logits = self._apply_penalties(logits, sampling_info) + logits = sampling_info.penalizer_orchestrator.apply(logits) - return torch.softmax(logits, dim=-1) - - def forward_cuda( - self, - logits: Union[torch.Tensor, LogitsProcessorOutput], - sampling_info: SamplingBatchInfo, - ): - if isinstance(logits, LogitsProcessorOutput): - logits = logits.next_token_logits - - probs = self._get_probs(logits, sampling_info) + probs = torch.softmax(logits, dim=-1) if not global_server_args_dict["disable_flashinfer_sampling"]: max_top_k_round, batch_size = 32, probs.shape[0] uniform_samples = torch.rand( (max_top_k_round, batch_size), device=probs.device ) - if sampling_info.need_min_p_sampling: + if sampling_info.min_ps.any(): probs = top_k_renorm_prob(probs, sampling_info.top_ks) probs = top_p_renorm_prob(probs, sampling_info.top_ps) batch_next_token_ids, success = min_p_sampling_from_probs( @@ -100,23 +55,18 @@ def forward_cuda( probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps ) - return SampleOutput(success, probs, batch_next_token_ids) - - def forward_native( - self, - logits: Union[torch.Tensor, LogitsProcessorOutput], - sampling_info: SamplingBatchInfo, - ): - if isinstance(logits, LogitsProcessorOutput): - logits = logits.next_token_logits - - probs = self._get_probs(logits, sampling_info, is_torch_compile=True) + if not torch.all(success): + 
logging.warning("Sampling failed, fallback to top_k=1 strategy") + probs = probs.masked_fill(torch.isnan(probs), 0.0) + argmax_ids = torch.argmax(probs, dim=-1) + batch_next_token_ids = torch.where( + success, batch_next_token_ids, argmax_ids + ) - batch_next_token_ids, success = top_k_top_p_min_p_sampling_from_probs_torch( - probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps - ) + return batch_next_token_ids - return SampleOutput(success, probs, batch_next_token_ids) + def forward_native(): + raise NotImplementedError("Native forward is not implemented yet.") def top_k_top_p_min_p_sampling_from_probs_torch( @@ -137,10 +87,7 @@ def top_k_top_p_min_p_sampling_from_probs_torch( probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0 probs_sort.div_(probs_sort.max(dim=-1, keepdim=True)[0]) try: - # FIXME: torch.multiomial does not support num_samples = 1 - sampled_index = torch.multinomial(probs_sort, num_samples=2, replacement=True)[ - :, :1 - ] + sampled_index = torch.multinomial(probs_sort, num_samples=1) except RuntimeError as e: logger.warning(f"Sampling error: {e}") batch_next_token_ids = torch.zeros( diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 926266a628..f3af821e4e 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -1,5 +1,3 @@ -from __future__ import annotations - """ Copyright 2023-2024 SGLang Team Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,7 +17,7 @@ import logging from dataclasses import dataclass -from typing import TYPE_CHECKING, List, Optional, Union +from typing import List, Optional, Union import torch @@ -31,10 +29,6 @@ from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo -if TYPE_CHECKING: - from sglang.srt.layers.sampler import SampleOutput - - INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5 # Put some global args for easy access @@ -684,17 +678,11 @@ def merge(self, other: "ScheduleBatch"): self.top_logprobs_nums.extend(other.top_logprobs_nums) self.return_logprob = any(req.return_logprob for req in self.reqs) - def check_sample_results(self, sample_output: SampleOutput): - if not torch.all(sample_output.success): - probs = sample_output.probs - batch_next_token_ids = sample_output.batch_next_token_ids - logging.warning("Sampling failed, fallback to top_k=1 strategy") - probs = probs.masked_fill(torch.isnan(probs), 0.0) - argmax_ids = torch.argmax(probs, dim=-1) - batch_next_token_ids = torch.where( - sample_output.success, batch_next_token_ids, argmax_ids - ) - sample_output.probs = probs - sample_output.batch_next_token_ids = batch_next_token_ids + def sample(self, logits: torch.Tensor): + from sglang.srt.layers.sampler import Sampler + + sampler = Sampler() + + batch_next_token_ids = sampler(logits, self.sampling_info) - return sample_output.batch_next_token_ids + return batch_next_token_ids diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index 127f71900a..65daed43b2 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -31,7 +31,7 @@ from sglang.srt.constrained.fsm_cache import FSMCache from sglang.srt.constrained.jump_forward import JumpForwardCache from sglang.srt.hf_transformers_utils import get_processor, get_tokenizer -from sglang.srt.layers.logits_processor import LogitsProcessorOutput +from 
sglang.srt.layers.logits_processor import LogitProcessorOutput from sglang.srt.managers.io_struct import ( AbortReq, BatchEmbeddingOut, @@ -505,29 +505,21 @@ def forward_prefill_batch(self, batch: ScheduleBatch): if self.model_runner.is_generation: # Forward and sample the next tokens if batch.extend_num_tokens != 0: - sample_output, logits_output = self.model_runner.forward( - batch, ForwardMode.EXTEND - ) - next_token_ids = batch.check_sample_results(sample_output) + output = self.model_runner.forward(batch, ForwardMode.EXTEND) + next_token_ids = batch.sample(output.next_token_logits) batch.sampling_info.penalizer_orchestrator.cumulate_output_tokens( next_token_ids ) # Move logprobs to cpu - if logits_output.next_token_logprobs is not None: - logits_output.next_token_logprobs = ( - logits_output.next_token_logprobs[ - torch.arange( - len(next_token_ids), device=next_token_ids.device - ), - next_token_ids, - ].tolist() - ) - logits_output.input_token_logprobs = ( - logits_output.input_token_logprobs.tolist() - ) - logits_output.normalized_prompt_logprobs = ( - logits_output.normalized_prompt_logprobs.tolist() + if output.next_token_logprobs is not None: + output.next_token_logprobs = output.next_token_logprobs[ + torch.arange(len(next_token_ids), device=next_token_ids.device), + next_token_ids, + ].tolist() + output.input_token_logprobs = output.input_token_logprobs.tolist() + output.normalized_prompt_logprobs = ( + output.normalized_prompt_logprobs.tolist() ) next_token_ids = next_token_ids.tolist() @@ -566,14 +558,12 @@ def forward_prefill_batch(self, batch: ScheduleBatch): self.req_to_token_pool.free(req.req_pool_idx) if req.return_logprob: - self.add_logprob_return_values( - i, req, pt, next_token_ids, logits_output - ) + self.add_logprob_return_values(i, req, pt, next_token_ids, output) pt += req.extend_input_len else: assert batch.extend_num_tokens != 0 - logits_output = self.model_runner.forward(batch, ForwardMode.EXTEND) - embeddings = logits_output.embeddings.tolist() + output = self.model_runner.forward(batch, ForwardMode.EXTEND) + embeddings = output.embeddings.tolist() # Check finish conditions for i, req in enumerate(batch.reqs): @@ -601,7 +591,7 @@ def add_logprob_return_values( req: Req, pt: int, next_token_ids: List[int], - output: LogitsProcessorOutput, + output: LogitProcessorOutput, ): if req.normalized_prompt_logprob is None: req.normalized_prompt_logprob = output.normalized_prompt_logprobs[i] @@ -683,17 +673,15 @@ def forward_decode_batch(self, batch: ScheduleBatch): batch.prepare_for_decode() # Forward and sample the next tokens - sample_output, logits_output = self.model_runner.forward( - batch, ForwardMode.DECODE - ) - next_token_ids = batch.check_sample_results(sample_output) + output = self.model_runner.forward(batch, ForwardMode.DECODE) + next_token_ids = batch.sample(output.next_token_logits) batch.sampling_info.penalizer_orchestrator.cumulate_output_tokens( next_token_ids ) # Move logprobs to cpu - if logits_output.next_token_logprobs is not None: - next_token_logprobs = logits_output.next_token_logprobs[ + if output.next_token_logprobs is not None: + next_token_logprobs = output.next_token_logprobs[ torch.arange(len(next_token_ids), device=next_token_ids.device), next_token_ids, ].tolist() @@ -719,7 +707,7 @@ def forward_decode_batch(self, batch: ScheduleBatch): (next_token_logprobs[i], next_token_id) ) if req.top_logprobs_num > 0: - req.output_top_logprobs.append(logits_output.output_top_logprobs[i]) + 
req.output_top_logprobs.append(output.output_top_logprobs[i]) self.handle_finished_requests(batch) diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index 40c87af88c..796db26623 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -26,18 +26,16 @@ from vllm.model_executor.custom_op import CustomOp from sglang.srt.layers.logits_processor import ( + LogitProcessorOutput, LogitsMetadata, LogitsProcessor, - LogitsProcessorOutput, ) -from sglang.srt.layers.sampler import SampleOutput from sglang.srt.managers.schedule_batch import ScheduleBatch from sglang.srt.model_executor.forward_batch_info import ( ForwardMode, InputMetadata, update_flashinfer_indices, ) -from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo from sglang.srt.utils import monkey_patch_vllm_all_gather @@ -146,10 +144,6 @@ def __init__( self.flashinfer_kv_indices.clone(), ] - # Sampling inputs - vocab_size = model_runner.model_config.vocab_size - self.sampling_info = SamplingBatchInfo.dummy_one(self.max_bs, vocab_size) - self.compile_bs = [1, 2, 4, 8, 16, 24, 32] if use_torch_compile else [] if use_torch_compile: @@ -241,7 +235,6 @@ def capture_one_batch_size(self, bs: int, forward: Callable): def run_once(): input_metadata = InputMetadata( forward_mode=ForwardMode.DECODE, - sampling_info=self.sampling_info[:bs], batch_size=bs, req_pool_indices=req_pool_indices, seq_lens=seq_lens, @@ -306,35 +299,27 @@ def replay(self, batch: ScheduleBatch): self.flashinfer_handlers[bs], ) - # Sampling inputs - self.sampling_info.inplace_assign(raw_bs, batch.sampling_info) - # Replay torch.cuda.synchronize() self.graphs[bs].replay() torch.cuda.synchronize() - sample_output, logits_output = self.output_buffers[bs] + output = self.output_buffers[bs] # Unpad if bs != raw_bs: - logits_output = LogitsProcessorOutput( - next_token_logits=logits_output.next_token_logits[:raw_bs], + output = LogitProcessorOutput( + next_token_logits=output.next_token_logits[:raw_bs], next_token_logprobs=None, normalized_prompt_logprobs=None, input_token_logprobs=None, input_top_logprobs=None, output_top_logprobs=None, ) - sample_output = SampleOutput( - sample_output.success[:raw_bs], - sample_output.probs[:raw_bs], - sample_output.batch_next_token_ids[:raw_bs], - ) # Extract logprobs if batch.return_logprob: - logits_output.next_token_logprobs = torch.nn.functional.log_softmax( - logits_output.next_token_logits, dim=-1 + output.next_token_logprobs = torch.nn.functional.log_softmax( + output.next_token_logits, dim=-1 ) return_top_logprob = any(x > 0 for x in batch.top_logprobs_nums) if return_top_logprob: @@ -342,8 +327,8 @@ def replay(self, batch: ScheduleBatch): forward_mode=ForwardMode.DECODE, top_logprobs_nums=batch.top_logprobs_nums, ) - logits_output.output_top_logprobs = LogitsProcessor.get_top_logprobs( - logits_output.next_token_logprobs, logits_metadata + output.output_top_logprobs = LogitsProcessor.get_top_logprobs( + output.next_token_logprobs, logits_metadata )[1] - return sample_output, logits_output + return output diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py index e8849962b0..c107b3bc82 100644 --- a/python/sglang/srt/model_executor/forward_batch_info.py +++ b/python/sglang/srt/model_executor/forward_batch_info.py @@ -1,5 +1,3 @@ -from __future__ import annotations - """ Copyright 2023-2024 SGLang Team Licensed 
under the Apache License, Version 2.0 (the "License"); @@ -18,7 +16,7 @@ """ModelRunner runs the forward passes of the models.""" from dataclasses import dataclass from enum import IntEnum, auto -from typing import TYPE_CHECKING, List +from typing import TYPE_CHECKING, List, Optional import numpy as np import torch @@ -28,7 +26,6 @@ if TYPE_CHECKING: from sglang.srt.model_executor.model_runner import ModelRunner - from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo class ForwardMode(IntEnum): @@ -45,7 +42,6 @@ class InputMetadata: """Store all inforamtion of a forward pass.""" forward_mode: ForwardMode - sampling_info: SamplingBatchInfo batch_size: int req_pool_indices: torch.Tensor seq_lens: torch.Tensor @@ -183,7 +179,6 @@ def from_schedule_batch( ): ret = cls( forward_mode=forward_mode, - sampling_info=batch.sampling_info, batch_size=batch.batch_size(), req_pool_indices=batch.req_pool_indices, seq_lens=batch.seq_lens, @@ -194,8 +189,6 @@ def from_schedule_batch( top_logprobs_nums=batch.top_logprobs_nums, ) - ret.sampling_info.prepare_penalties() - ret.compute_positions(batch) ret.compute_extend_infos(batch) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 0dd9f8c201..abee152d6f 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -21,7 +21,7 @@ import logging import pkgutil from functools import lru_cache -from typing import Optional, Tuple, Type +from typing import Optional, Type import torch import torch.nn as nn @@ -44,8 +44,6 @@ from vllm.model_executor.models import ModelRegistry from sglang.global_config import global_config -from sglang.srt.layers.logits_processor import LogitsProcessorOutput -from sglang.srt.layers.sampler import SampleOutput from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict from sglang.srt.mem_cache.memory_pool import ( MHATokenToKVPool, @@ -517,11 +515,7 @@ def init_cuda_graphs(self): @torch.inference_mode() def forward_decode(self, batch: ScheduleBatch): - if ( - self.cuda_graph_runner - and self.cuda_graph_runner.can_run(len(batch.reqs)) - and not batch.sampling_info.has_bias() - ): + if self.cuda_graph_runner and self.cuda_graph_runner.can_run(len(batch.reqs)): return self.cuda_graph_runner.replay(batch) input_metadata = InputMetadata.from_schedule_batch( @@ -570,9 +564,7 @@ def forward_extend_multi_modal(self, batch: ScheduleBatch): input_metadata.image_offsets, ) - def forward( - self, batch: ScheduleBatch, forward_mode: ForwardMode - ) -> Tuple[SampleOutput, LogitsProcessorOutput]: + def forward(self, batch: ScheduleBatch, forward_mode: ForwardMode): if self.is_multimodal_model and forward_mode == ForwardMode.EXTEND: return self.forward_extend_multi_modal(batch) elif forward_mode == ForwardMode.DECODE: diff --git a/python/sglang/srt/models/chatglm.py b/python/sglang/srt/models/chatglm.py index 1c189eebbc..0a22f994bb 100644 --- a/python/sglang/srt/models/chatglm.py +++ b/python/sglang/srt/models/chatglm.py @@ -31,18 +31,20 @@ ) from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import 
SamplingMetadata +from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import ChatGLMConfig from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata LoraConfig = None @@ -381,11 +383,17 @@ def forward( input_metadata: InputMetadata, ) -> torch.Tensor: hidden_states = self.transformer(input_ids, positions, input_metadata) - logits_output = self.logits_processor( + return self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return sample_output, logits_output + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters(remove_duplicate=False)) diff --git a/python/sglang/srt/models/commandr.py b/python/sglang/srt/models/commandr.py index c360106f97..f6d6f6e1f9 100644 --- a/python/sglang/srt/models/commandr.py +++ b/python/sglang/srt/models/commandr.py @@ -64,7 +64,6 @@ from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -327,7 +326,6 @@ def __init__( self.config = config self.quant_config = quant_config self.logits_processor = LogitsProcessor(config) - self.sampler = Sampler() self.model = CohereModel(config, quant_config) @torch.no_grad() @@ -342,11 +340,9 @@ def forward( positions, input_metadata, ) - logits_output = self.logits_processor( + return self.logits_processor( input_ids, hidden_states, self.model.embed_tokens.weight, input_metadata ) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/dbrx.py b/python/sglang/srt/models/dbrx.py index b3a76b56ae..39ac4aefa7 100644 --- a/python/sglang/srt/models/dbrx.py +++ b/python/sglang/srt/models/dbrx.py @@ -45,7 +45,6 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -383,7 +382,6 @@ def __init__( padding_size=DEFAULT_VOCAB_PADDING_SIZE, ) self.logits_processor = LogitsProcessor(config) - self.sampler = Sampler() @torch.no_grad() def forward( @@ -393,11 +391,9 @@ def forward( input_metadata: InputMetadata, ) -> torch.Tensor: hidden_states = self.transformer(input_ids, positions, input_metadata) - logits_output = self.logits_processor( + return self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): expert_params_mapping = [ diff --git 
a/python/sglang/srt/models/deepseek.py b/python/sglang/srt/models/deepseek.py index b939602c1b..59fd1ec7ed 100644 --- a/python/sglang/srt/models/deepseek.py +++ b/python/sglang/srt/models/deepseek.py @@ -46,7 +46,6 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -386,7 +385,6 @@ def __init__( config.vocab_size, config.hidden_size, quant_config=quant_config ) self.logits_processor = LogitsProcessor(config) - self.sampler = Sampler() @torch.no_grad() def forward( @@ -396,11 +394,9 @@ def forward( input_metadata: InputMetadata, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata) - logits_output = self.logits_processor( + return self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index 15ecf4bb66..13dd477392 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -45,7 +45,6 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.managers.schedule_batch import global_server_args_dict from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -633,7 +632,6 @@ def __init__( config.vocab_size, config.hidden_size, quant_config=quant_config ) self.logits_processor = LogitsProcessor(config) - self.sampler = Sampler() def forward( self, @@ -642,11 +640,9 @@ def forward( input_metadata: InputMetadata, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata) - logits_output = self.logits_processor( + return self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/gemma.py b/python/sglang/srt/models/gemma.py index 61cc5c66ea..990937f518 100644 --- a/python/sglang/srt/models/gemma.py +++ b/python/sglang/srt/models/gemma.py @@ -37,7 +37,6 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -288,7 +287,6 @@ def __init__( self.quant_config = quant_config self.model = GemmaModel(config, quant_config=quant_config) self.logits_processor = LogitsProcessor(config) - self.sampler = Sampler() @torch.no_grad() def forward( @@ -299,11 +297,9 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - logits_output = self.logits_processor( + return self.logits_processor( input_ids, hidden_states, self.model.embed_tokens.weight, input_metadata 
) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return (sample_output, logits_output) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/gemma2.py b/python/sglang/srt/models/gemma2.py index fabf86b498..c6dbc7e556 100644 --- a/python/sglang/srt/models/gemma2.py +++ b/python/sglang/srt/models/gemma2.py @@ -41,7 +41,6 @@ from sglang.srt.layers.activation import GeluAndMul from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -397,7 +396,6 @@ def __init__( self.quant_config = quant_config self.model = Gemma2Model(config, cache_config, quant_config) self.logits_processor = LogitsProcessor(config) - self.sampler = Sampler() @torch.no_grad() def forward( @@ -408,11 +406,9 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - logits_output = self.logits_processor( + return self.logits_processor( input_ids, hidden_states, self.model.embed_tokens.weight, input_metadata ) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return sample_output, logits_output def get_attention_sliding_window_size(self): return get_attention_sliding_window_size(self.config) diff --git a/python/sglang/srt/models/gpt_bigcode.py b/python/sglang/srt/models/gpt_bigcode.py index dc828f0142..94b7f6153c 100644 --- a/python/sglang/srt/models/gpt_bigcode.py +++ b/python/sglang/srt/models/gpt_bigcode.py @@ -35,7 +35,6 @@ from sglang.srt.layers.activation import get_act_fn from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -262,7 +261,6 @@ def __init__( if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.logits_processor = LogitsProcessor(config) - self.sampler = Sampler() @torch.no_grad() def forward( @@ -272,11 +270,9 @@ def forward( input_metadata: InputMetadata, ) -> torch.Tensor: hidden_states = self.transformer(input_ids, positions, input_metadata) - logits_output = self.logits_processor( + return self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters(remove_duplicate=False)) diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py index 85a89ca3ed..4a0a08bf88 100644 --- a/python/sglang/srt/models/grok.py +++ b/python/sglang/srt/models/grok.py @@ -46,7 +46,6 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -298,7 +297,6 @@ def __init__( self.model = Grok1Model(config, quant_config=quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) - self.sampler = Sampler() # Monkey patch _prepare_weights 
to load pre-sharded weights setattr(DefaultModelLoader, "_prepare_weights", _prepare_presharded_weights) @@ -315,11 +313,9 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - logits_output = self.logits_processor( + return self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/internlm2.py b/python/sglang/srt/models/internlm2.py index c0e4d19e12..f2947e991b 100644 --- a/python/sglang/srt/models/internlm2.py +++ b/python/sglang/srt/models/internlm2.py @@ -40,7 +40,6 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -263,7 +262,6 @@ def __init__( self.model = InternLM2Model(config, quant_config) self.output = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) - self.sampler = Sampler() @torch.no_grad() def forward( @@ -274,11 +272,9 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - logits_output = self.logits_processor( + return self.logits_processor( input_ids, hidden_states, self.output.weight, input_metadata ) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/llama2.py b/python/sglang/srt/models/llama2.py index 42e9612303..9de8d33c5c 100644 --- a/python/sglang/srt/models/llama2.py +++ b/python/sglang/srt/models/llama2.py @@ -39,9 +39,8 @@ from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm -from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput +from sglang.srt.layers.logits_processor import LogitProcessorOutput, LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -303,7 +302,6 @@ def __init__( self.model = LlamaModel(config, quant_config=quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) - self.sampler = Sampler() @torch.no_grad() def forward( @@ -312,13 +310,11 @@ def forward( positions: torch.Tensor, input_metadata: InputMetadata, input_embeds: torch.Tensor = None, - ) -> LogitsProcessorOutput: + ) -> LogitProcessorOutput: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - logits_output = self.logits_processor( + return self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return sample_output, logits_output def get_module_name(self, name): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/llama_classification.py b/python/sglang/srt/models/llama_classification.py index 
fdf6d28e55..02224971d6 100644 --- a/python/sglang/srt/models/llama_classification.py +++ b/python/sglang/srt/models/llama_classification.py @@ -24,7 +24,7 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from sglang.srt.layers.logits_processor import LogitsProcessorOutput +from sglang.srt.layers.logits_processor import LogitProcessorOutput from sglang.srt.model_executor.forward_batch_info import InputMetadata from sglang.srt.models.llama2 import LlamaModel @@ -65,7 +65,7 @@ def forward( (input_metadata.batch_size, self.config.classification_out_size) ).to(input_ids.device) - return LogitsProcessorOutput( + return LogitProcessorOutput( next_token_logits=scores, next_token_logprobs=scores, normalized_prompt_logprobs=scores, diff --git a/python/sglang/srt/models/minicpm.py b/python/sglang/srt/models/minicpm.py index 0028ae67a8..49ff1926f3 100644 --- a/python/sglang/srt/models/minicpm.py +++ b/python/sglang/srt/models/minicpm.py @@ -39,7 +39,6 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -298,7 +297,6 @@ def __init__( self.scale_width = self.config.hidden_size / self.config.dim_model_base self.logits_processor = LogitsProcessor(config) - self.sampler = Sampler() @torch.no_grad() def forward( @@ -316,11 +314,9 @@ def forward( lm_head_weight = self.model.embed_tokens.weight else: lm_head_weight = self.lm_head.weight - logits_output = self.logits_processor( + return self.logits_processor( input_ids, hidden_states, lm_head_weight, input_metadata ) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py index ca38cb03ba..d11f6c9519 100644 --- a/python/sglang/srt/models/mixtral.py +++ b/python/sglang/srt/models/mixtral.py @@ -41,7 +41,6 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -300,7 +299,6 @@ def __init__( self.model = MixtralModel(config, quant_config=quant_config, prefix="model") self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) - self.sampler = Sampler() def forward( self, @@ -310,11 +308,9 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - logits_output = self.logits_processor( + return self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/mixtral_quant.py b/python/sglang/srt/models/mixtral_quant.py index 97ac09ee62..b02e925c5a 100644 --- a/python/sglang/srt/models/mixtral_quant.py +++ 
b/python/sglang/srt/models/mixtral_quant.py @@ -45,7 +45,6 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -334,7 +333,6 @@ def __init__( self.model = MixtralModel(config, quant_config=quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) - self.sampler = Sampler() @torch.no_grad() def forward( @@ -345,11 +343,9 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - logits_output = self.logits_processor( + return self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/qwen.py b/python/sglang/srt/models/qwen.py index 4958a81298..93dae9585c 100644 --- a/python/sglang/srt/models/qwen.py +++ b/python/sglang/srt/models/qwen.py @@ -39,7 +39,6 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -252,7 +251,6 @@ def __init__( vocab_size = ((config.vocab_size + 63) // 64) * 64 self.lm_head = ParallelLMHead(vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) - self.sampler = Sampler() @torch.no_grad() def forward( @@ -262,11 +260,10 @@ def forward( input_metadata: InputMetadata, ): hidden_states = self.transformer(input_ids, positions, input_metadata) - logits_output = self.logits_processor( + next_tokens = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return sample_output, logits_output + return next_tokens def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py index 76094b907a..fcf083e1b5 100644 --- a/python/sglang/srt/models/qwen2.py +++ b/python/sglang/srt/models/qwen2.py @@ -38,9 +38,8 @@ from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor -from sglang.srt.layers.pooler import Pooler, PoolingType +from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata Qwen2Config = None @@ -277,7 +276,6 @@ def __init__( self.model = Qwen2Model(config, quant_config=quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) - self.sampler = Sampler() self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True) @torch.no_grad() @@ -291,11 +289,9 @@ def forward( ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, 
input_metadata, input_embeds) if not get_embedding: - logits_output = self.logits_processor( + return self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return sample_output, logits_output else: return self.pooler(hidden_states, input_metadata) diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py index e08695bc61..9bdbd75066 100644 --- a/python/sglang/srt/models/qwen2_moe.py +++ b/python/sglang/srt/models/qwen2_moe.py @@ -35,8 +35,10 @@ ReplicatedLinear, RowParallelLinear, ) +from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, @@ -47,7 +49,6 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -365,7 +366,6 @@ def __init__( config.vocab_size, config.hidden_size, quant_config=quant_config ) self.logits_processor = LogitsProcessor(config) - self.sampler = Sampler() @torch.no_grad() def forward( @@ -376,11 +376,20 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - logits_output = self.logits_processor( + return self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return sample_output, logits_output + + def compute_logits( + self, + input_ids: torch.Tensor, + hidden_states: torch.Tensor, + input_metadata: InputMetadata, + ) -> torch.Tensor: + logits = self.logits_processor( + input_ids, hidden_states, self.lm_head.weight, input_metadata + ) + return logits def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/stablelm.py b/python/sglang/srt/models/stablelm.py index a3102baabd..9e10f12f2a 100644 --- a/python/sglang/srt/models/stablelm.py +++ b/python/sglang/srt/models/stablelm.py @@ -40,7 +40,6 @@ from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -250,7 +249,6 @@ def __init__( self.model = StableLMEpochModel(config, quant_config=quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) - self.sampler = Sampler() @torch.no_grad() def forward( @@ -261,11 +259,9 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - logits_output = self.logits_processor( + return self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - sample_output = self.sampler(logits_output, input_metadata.sampling_info) - return sample_output, logits_output def load_weights(self, weights: 
Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/sampling/sampling_batch_info.py b/python/sglang/srt/sampling/sampling_batch_info.py index 7843f4bd32..bc70a9018e 100644 --- a/python/sglang/srt/sampling/sampling_batch_info.py +++ b/python/sglang/srt/sampling/sampling_batch_info.py @@ -21,63 +21,10 @@ class SamplingBatchInfo: top_ps: torch.Tensor = None top_ks: torch.Tensor = None min_ps: torch.Tensor = None - - # Dispatch in CUDA graph - need_min_p_sampling: bool = False - - # Bias Tensors + penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None logit_bias: torch.Tensor = None vocab_mask: torch.Tensor = None - # Penalizer - penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None - linear_penalties: torch.Tensor = None - scaling_penalties: torch.Tensor = None - - def has_bias(self): - return ( - self.logit_bias is not None - or self.vocab_mask is not None - or self.linear_penalties is not None - or self.scaling_penalties is not None - ) - - @classmethod - def dummy_one(cls, max_bs: int, vocab_size: int): - ret = cls(vocab_size=vocab_size) - ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda") - ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda") - ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda") - ret.min_ps = torch.zeros((max_bs,), dtype=torch.float, device="cuda") - return ret - - def __getitem__(self, key): - if isinstance(key, slice): - # NOTE: We do not use cuda graph when there is bias tensors - assert not self.has_bias() - return SamplingBatchInfo( - vocab_size=self.vocab_size, - temperatures=self.temperatures[key], - top_ps=self.top_ps[key], - top_ks=self.top_ks[key], - min_ps=self.min_ps[key], - need_min_p_sampling=self.need_min_p_sampling, - ) - else: - raise NotImplementedError - - def inplace_assign(self, bs: int, other: SamplingBatchInfo): - # NOTE: We do not use cuda graph when there is bias tensors - assert not self.has_bias() - - self.vocab_size = other.vocab_size - self.need_min_p_sampling = other.need_min_p_sampling - - self.temperatures[:bs] = other.temperatures - self.top_ps[:bs] = other.top_ps - self.top_ks[:bs] = other.top_ks - self.min_ps[:bs] = other.min_ps - @classmethod def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int): device = "cuda" @@ -98,7 +45,6 @@ def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int): ret.min_ps = torch.tensor( [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device ) - ret.need_min_p_sampling = any(r.sampling_params.min_p > 0 for r in reqs) # Each penalizers will do nothing if they evaluate themselves as not required by looking at # the sampling_params of the requests (See {_is_required()} of each penalizers). 
So this @@ -126,25 +72,6 @@ def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int): return ret - def prepare_penalties(self): - self.scaling_penalties = None - self.linear_penalties = None - - for penalizer in self.penalizer_orchestrator.penalizers.values(): - if isinstance(penalizer, penaltylib.BatchedRepetitionPenalizer): - if penalizer.is_prepared(): - self.scaling_penalties = penalizer.cumulated_repetition_penalties - else: - if penalizer.is_prepared(): - if self.linear_penalties is None: - bs = self.penalizer_orchestrator.batch.batch_size() - self.linear_penalties = torch.zeros( - (bs, self.vocab_size), - dtype=torch.float32, - device="cuda", - ) - self.linear_penalties = penalizer.apply(self.linear_penalties) - def update_regex_vocab_mask(self, batch: ScheduleBatch): bs, reqs = batch.batch_size(), batch.reqs device = "cuda" diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index 2d3b0aefa3..37ed2cf9ad 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -180,7 +180,7 @@ def __init__( tp_size=tp_size, dtype=get_dtype_str(torch_dtype), port=port, - mem_fraction_static=0.69, + mem_fraction_static=0.7, trust_remote_code=False, is_embedding=not self.is_generation, ) diff --git a/python/sglang/version.py b/python/sglang/version.py index f3291e93b7..839b265519 100644 --- a/python/sglang/version.py +++ b/python/sglang/version.py @@ -1 +1 @@ -__version__ = "0.2.14" +__version__ = "0.2.14.post1" From 6c498313942b32e548dd0b499f279db0abc5b085 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 28 Aug 2024 04:20:54 -0700 Subject: [PATCH 105/118] Add sglang.bench_latency to CI (#1243) --- .github/workflows/e2e-test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 7f555110d9..11c94775c1 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -38,6 +38,11 @@ jobs: cd test/srt python3 -m unittest test_serving_throughput.TestServingThroughput.test_default + - name: Benchmark Serving Latency + timeout-minutes: 10 + run: | + python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 1 --input 128 --output 8 + - name: Benchmark Serving Throughput (w/o RadixAttention) timeout-minutes: 10 run: | From 66975360e7575a5f573cdaf5c6892d81afc3ed19 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Wed, 28 Aug 2024 22:12:36 +1000 Subject: [PATCH 106/118] fix: increase max_new_tokens when testing generation models (#1244) --- python/sglang/test/runners.py | 2 +- test/srt/models/test_generation_models.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index 37ed2cf9ad..e69d699a7d 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -30,7 +30,7 @@ # the output of gemma-2-2b from SRT is unstable on the commented prompt # "The capital of France is", "Apple is red. Banana is Yellow. 
" * 800 + "Apple is", - "The capital of the United Kindom is", + "The capital of the United Kingdom is", "Today is a sunny day and I like", "AI is a field of computer science focused on", ] diff --git a/test/srt/models/test_generation_models.py b/test/srt/models/test_generation_models.py index b953ccf5d6..e38584741e 100644 --- a/test/srt/models/test_generation_models.py +++ b/test/srt/models/test_generation_models.py @@ -62,7 +62,6 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2): class TestGenerationModels(unittest.TestCase): - def assert_close_prefill_logits_and_output_strs( self, prompts, @@ -99,14 +98,15 @@ def assert_close_prefill_logits_and_output_strs( abs(hf_logprobs - srt_logprobs) < prefill_tolerance ), "prefill logprobs are not all close" - print(hf_outputs.output_strs) - print(srt_outputs.output_strs) + print(f"hf_outputs.output_strs={hf_outputs.output_strs}") + print(f"srt_outputs.output_strs={srt_outputs.output_strs}") rouge_l_scores = calculate_rouge_l( hf_outputs.output_strs, srt_outputs.output_strs ) + print(f"rouge_l_scores={rouge_l_scores}") assert all( score >= rouge_threshold for score in rouge_l_scores - ), f"Not all ROUGE-L scores are greater than {rouge_threshold}" + ), f"Not all ROUGE-L scores are greater than rouge_threshold={rouge_threshold}" def test_prefill_logits_and_output_strs(self): for ( @@ -117,7 +117,7 @@ def test_prefill_logits_and_output_strs(self): rouge_threshold, ) in MODELS: for torch_dtype in TORCH_DTYPES: - max_new_tokens = 8 + max_new_tokens = 32 self.assert_close_prefill_logits_and_output_strs( DEFAULT_PROMPTS, model, From b1a540ec42cdd7b2875ce4b84587c522458bc065 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Wed, 28 Aug 2024 22:47:34 +1000 Subject: [PATCH 107/118] feat: update GemmaRMSNorm (#1232) --- python/sglang/srt/layers/layernorm.py | 48 +++++++++++++++++++++++- python/sglang/srt/models/gemma2.py | 52 +------------------------- python/sglang/test/test_layernorm.py | 54 ++++++++++++++++++++++++++- 3 files changed, 101 insertions(+), 53 deletions(-) diff --git a/python/sglang/srt/layers/layernorm.py b/python/sglang/srt/layers/layernorm.py index ac4d368d3f..4c24f50ffe 100644 --- a/python/sglang/srt/layers/layernorm.py +++ b/python/sglang/srt/layers/layernorm.py @@ -19,7 +19,12 @@ import torch import torch.nn as nn -from flashinfer.norm import fused_add_rmsnorm, rmsnorm +from flashinfer.norm import ( + fused_add_rmsnorm, + gemma_fused_add_rmsnorm, + gemma_rmsnorm, + rmsnorm, +) from vllm.model_executor.custom_op import CustomOp @@ -63,3 +68,44 @@ def forward_native( return x else: return x, residual + + +class GemmaRMSNorm(CustomOp): + def __init__( + self, + hidden_size: int, + eps: float = 1e-6, + ) -> None: + super().__init__() + self.weight = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward_native( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + orig_dtype = x.dtype + if residual is not None: + x = x + residual + residual = x + + x = x.float() + variance = x.pow(2).mean(dim=-1, keepdim=True) + x = x * torch.rsqrt(variance + self.variance_epsilon) + x = x * (1.0 + self.weight.float()) + x = x.to(orig_dtype) + return x if residual is None else (x, residual) + + def forward_cuda( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + if residual is not None: + gemma_fused_add_rmsnorm( + x, residual, self.weight.data, 
self.variance_epsilon + ) + return x, residual + out = gemma_rmsnorm(x, self.weight.data, self.variance_epsilon) + return out diff --git a/python/sglang/srt/models/gemma2.py b/python/sglang/srt/models/gemma2.py index c6dbc7e556..3223424d79 100644 --- a/python/sglang/srt/models/gemma2.py +++ b/python/sglang/srt/models/gemma2.py @@ -22,11 +22,6 @@ from transformers import PretrainedConfig from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size - -# FIXME: temporary solution, remove after next vllm release -from vllm.model_executor.custom_op import CustomOp - -# from vllm.model_executor.layers.layernorm import GemmaRMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, @@ -39,6 +34,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from sglang.srt.layers.activation import GeluAndMul +from sglang.srt.layers.layernorm import GemmaRMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -50,52 +46,6 @@ def get_attention_sliding_window_size(config): return config.sliding_window - 1 -class GemmaRMSNorm(CustomOp): - """RMS normalization for Gemma. - - Two differences from the above RMSNorm: - 1. x * (1 + w) instead of x * w. - 2. (x * w).to(orig_dtype) instead of x.to(orig_dtype) * w. - """ - - def __init__( - self, - hidden_size: int, - eps: float = 1e-6, - ) -> None: - super().__init__() - self.weight = nn.Parameter(torch.zeros(hidden_size)) - self.variance_epsilon = eps - - def forward_native( - self, - x: torch.Tensor, - residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - """PyTorch-native implementation equivalent to forward().""" - orig_dtype = x.dtype - if residual is not None: - x = x + residual - residual = x - - x = x.float() - variance = x.pow(2).mean(dim=-1, keepdim=True) - x = x * torch.rsqrt(variance + self.variance_epsilon) - # Llama does x.to(float16) * w whilst Gemma is (x * w).to(float16) - # See https://github.com/huggingface/transformers/pull/29402 - x = x * (1.0 + self.weight.float()) - x = x.to(orig_dtype) - return x if residual is None else (x, residual) - - def forward_cuda( - self, - x: torch.Tensor, - residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - # from vLLM: TODO(woosuk): Implement an optimized kernel for GemmaRMSNorm. 
- return self.forward_native(x, residual) - - # FIXME: temporary solution, remove after next vllm release from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding diff --git a/python/sglang/test/test_layernorm.py b/python/sglang/test/test_layernorm.py index ab61aa8040..770e69733d 100644 --- a/python/sglang/test/test_layernorm.py +++ b/python/sglang/test/test_layernorm.py @@ -3,7 +3,7 @@ import torch -from sglang.srt.layers.layernorm import RMSNorm +from sglang.srt.layers.layernorm import GemmaRMSNorm, RMSNorm class TestRMSNorm(unittest.TestCase): @@ -56,5 +56,57 @@ def test_rms_norm(self): self._run_rms_norm_test(*params) +class TestGemmaRMSNorm(unittest.TestCase): + DTYPES = [torch.half, torch.bfloat16] + NUM_TOKENS = [7, 83, 4096] + HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199] + ADD_RESIDUAL = [False, True] + SEEDS = [0] + + @classmethod + def setUpClass(cls): + if not torch.cuda.is_available(): + raise unittest.SkipTest("CUDA is not available") + torch.set_default_device("cuda") + + def _run_gemma_rms_norm_test( + self, num_tokens, hidden_size, add_residual, dtype, seed + ): + torch.manual_seed(seed) + + layer = GemmaRMSNorm(hidden_size).to(dtype=dtype) + layer.weight.data.normal_(mean=1.0, std=0.1) + scale = 1 / (2 * hidden_size) + x = torch.randn(num_tokens, hidden_size, dtype=dtype) * scale + residual = torch.randn_like(x) * scale if add_residual else None + + with torch.inference_mode(): + ref_out = layer.forward_native(x, residual) + out = layer(x, residual) + + if add_residual: + self.assertTrue(torch.allclose(out[0], ref_out[0], atol=1e-3, rtol=1e-3)) + self.assertTrue(torch.allclose(out[1], ref_out[1], atol=1e-3, rtol=1e-3)) + else: + self.assertTrue(torch.allclose(out, ref_out, atol=1e-3, rtol=1e-3)) + + def test_gemma_rms_norm(self): + for params in itertools.product( + self.NUM_TOKENS, + self.HIDDEN_SIZES, + self.ADD_RESIDUAL, + self.DTYPES, + self.SEEDS, + ): + with self.subTest( + num_tokens=params[0], + hidden_size=params[1], + add_residual=params[2], + dtype=params[3], + seed=params[4], + ): + self._run_gemma_rms_norm_test(*params) + + if __name__ == "__main__": unittest.main(verbosity=2) From bf53bf5142bd3393d495608e58c86f6d8c991664 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 28 Aug 2024 06:33:05 -0700 Subject: [PATCH 108/118] [Fix] Fix llava on multi images (#1247) --- README.md | 2 +- .../usage/llava_video/srt_example_llava_v.py | 13 +- python/sglang/launch_server_llavavid.py | 26 +++ python/sglang/srt/hf_transformers_utils.py | 149 ------------------ python/sglang/srt/managers/io_struct.py | 9 +- python/sglang/srt/managers/schedule_batch.py | 10 +- .../sglang/srt/managers/tokenizer_manager.py | 135 +++++++++------- python/sglang/srt/managers/tp_worker.py | 19 ++- .../srt/model_executor/forward_batch_info.py | 30 ++-- .../sglang/srt/model_executor/model_runner.py | 21 ++- python/sglang/srt/models/chatglm.py | 2 +- python/sglang/srt/models/grok.py | 12 +- python/sglang/srt/models/llama2.py | 7 +- .../sglang/srt/models/llama_classification.py | 4 - python/sglang/srt/models/llama_embedding.py | 7 +- python/sglang/srt/models/llava.py | 111 +++++-------- python/sglang/srt/models/llavavid.py | 126 +++++---------- python/sglang/srt/models/qwen2.py | 7 +- python/sglang/srt/models/yivl.py | 9 +- python/sglang/srt/server.py | 8 +- python/sglang/srt/utils.py | 51 +++--- test/srt/test_vision_openai_server.py | 2 - 22 files changed, 272 insertions(+), 488 deletions(-) create mode 100644 python/sglang/launch_server_llavavid.py 
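The diffs below thread per-request lists (`image_hashes`, `image_sizes`, `image_offsets`) through the tokenizer manager, scheduler, and the LLaVA models so that a single prompt can reference several images. As a rough client-side sketch of the request shape this enables (assuming a server launched as in the README with `--chat-template=chatml-llava` on port 30000, and placeholder image URLs):

```python
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="default",
    messages=[
        {
            "role": "user",
            "content": [
                # Two images in one turn; the URLs are placeholders.
                {"type": "image_url", "image_url": {"url": "https://example.com/a.jpg"}},
                {"type": "image_url", "image_url": {"url": "https://example.com/b.jpg"}},
                {"type": "text", "text": "What differs between the two images?"},
            ],
        }
    ],
    temperature=0,
    max_tokens=64,
)
print(response.choices[0].message.content)
```

See test/srt/test_vision_openai_server.py in this series for the payloads exercised in CI.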
diff --git a/README.md b/README.md index 223f9624f6..9d795ce438 100644 --- a/README.md +++ b/README.md @@ -240,7 +240,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct - Qwen / Qwen 2 / Qwen 2 MoE - DeepSeek / DeepSeek 2 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/) - - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384` + - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava` - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py) - LLaVA 1.5 / 1.6 / NeXT - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3` diff --git a/examples/frontend_language/usage/llava_video/srt_example_llava_v.py b/examples/frontend_language/usage/llava_video/srt_example_llava_v.py index 085bcea5a2..1f2931a5a4 100644 --- a/examples/frontend_language/usage/llava_video/srt_example_llava_v.py +++ b/examples/frontend_language/usage/llava_video/srt_example_llava_v.py @@ -184,13 +184,9 @@ def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size= # Parse the arguments args = parser.parse_args() - cur_port = args.port - cur_chunk = args.chunk_idx - num_chunks = args.num_chunks - num_frames = args.num_frames if "34b" in args.model_path.lower(): @@ -202,7 +198,6 @@ def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size= exit() model_overide_args = {} - model_overide_args["mm_spatial_pool_stride"] = args.mm_spatial_pool_stride model_overide_args["architectures"] = ["LlavaVidForCausalLM"] model_overide_args["num_frames"] = args.num_frames @@ -235,7 +230,6 @@ def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size= print(f"chat template: {runtime.endpoint.chat_template.name}") # Run a single request - # try: print("\n========== single ==========\n") root = args.video_dir if os.path.isfile(root): @@ -257,13 +251,10 @@ def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size= ) # Calculate the average processing time print(f"Average processing time per video: {average_time:.2f} seconds") runtime.shutdown() - # except Exception as e: - # print(e) - runtime.shutdown() - # # # Run a batch of requests + # # Run a batch of requests # print("\n========== batch ==========\n") # if not os.path.exists(args.save_dir): # os.makedirs(args.save_dir) - # batch(args.video_dir,args.save_dir,cur_chunk, num_chunks, num_frames, num_chunks) + # batch(args.video_dir, args.save_dir, cur_chunk, num_chunks, num_frames, num_chunks) # runtime.shutdown() diff --git a/python/sglang/launch_server_llavavid.py b/python/sglang/launch_server_llavavid.py new file mode 100644 index 0000000000..797ad07a47 --- /dev/null +++ b/python/sglang/launch_server_llavavid.py @@ -0,0 +1,26 @@ +"""Launch the inference server for Llava-video model.""" + +import argparse + +from sglang.srt.server import ServerArgs, launch_server + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ServerArgs.add_cli_args(parser) + args = parser.parse_args() + server_args = ServerArgs.from_cli_args(args) + + model_overide_args = {} + model_overide_args["mm_spatial_pool_stride"] = 2 + 
model_overide_args["architectures"] = ["LlavaVidForCausalLM"] + model_overide_args["num_frames"] = 16 + model_overide_args["model_type"] = "llavavid" + if model_overide_args["num_frames"] == 32: + model_overide_args["rope_scaling"] = {"factor": 2.0, "type": "linear"} + model_overide_args["max_sequence_length"] = 4096 * 2 + model_overide_args["tokenizer_model_max_length"] = 4096 * 2 + model_overide_args["model_max_length"] = 4096 * 2 + if "34b" in args.model_path.lower(): + model_overide_args["image_token_index"] = 64002 + + launch_server(server_args, model_overide_args, None) diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py index 4f6e3d0715..2be4169140 100644 --- a/python/sglang/srt/hf_transformers_utils.py +++ b/python/sglang/srt/hf_transformers_utils.py @@ -119,24 +119,7 @@ def get_tokenizer( tokenizer_revision: Optional[str] = None, **kwargs, ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: - if tokenizer_name.endswith(".json"): - return TiktokenTokenizer(tokenizer_name) - - if tokenizer_name.endswith(".model"): - return SentencePieceTokenizer(tokenizer_name) - """Gets a tokenizer for the given model name via Huggingface.""" - if is_multimodal_model(tokenizer_name): - processor = get_processor( - tokenizer_name, - *args, - trust_remote_code=trust_remote_code, - tokenizer_revision=tokenizer_revision, - **kwargs, - ) - tokenizer = processor.tokenizer - return tokenizer - if tokenizer_mode == "slow": if kwargs.get("use_fast", False): raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.") @@ -199,135 +182,3 @@ def get_processor( **kwargs, ) return processor - - -class TiktokenTokenizer: - def __init__(self, tokenizer_path): - import tiktoken - from jinja2 import Template - - PAT_STR_B = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""" - - # Read JSON - name = "tmp-json" - with open(tokenizer_path, "rb") as fin: - tok_dict = json.load(fin) - - mergeable_ranks = { - bytes(item["bytes"]): item["token"] for item in tok_dict["regular_tokens"] - } - special_tokens = { - bytes(item["bytes"]).decode(): item["token"] - for item in tok_dict["special_tokens"] - } - assert tok_dict["word_split"] == "V1" - - default_allowed_special = None - - kwargs = { - "name": name, - "pat_str": tok_dict.get("pat_str", PAT_STR_B), - "mergeable_ranks": mergeable_ranks, - "special_tokens": special_tokens, - } - if "default_allowed_special" in tok_dict: - default_allowed_special = set( - [ - bytes(bytes_list).decode() - for bytes_list in tok_dict["default_allowed_special"] - ] - ) - if "vocab_size" in tok_dict: - kwargs["explicit_n_vocab"] = tok_dict["vocab_size"] - - PAD = "<|pad|>" - EOS = "<|eos|>" - SEP = "<|separator|>" - - DEFAULT_CONTROL_TOKENS = {"pad": PAD, "sep": EOS, "eos": SEP} - - tokenizer = tiktoken.Encoding(**kwargs) - tokenizer._default_allowed_special = default_allowed_special or set() - tokenizer._control_tokens = DEFAULT_CONTROL_TOKENS - - def encode_patched( - self, - text: str, - *, - allowed_special: Union[ - Literal["all"], AbstractSet[str] - ] = set(), # noqa: B006 - disallowed_special: Union[Literal["all"], Collection[str]] = "all", - ) -> List[int]: - if isinstance(allowed_special, set): - allowed_special |= self._default_allowed_special - return tiktoken.Encoding.encode( - self, - text, - allowed_special=allowed_special, - disallowed_special=(), - ) - - tokenizer.encode = functools.partial(encode_patched, tokenizer) - - # Convert to HF interface 
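With the custom tiktoken and sentencepiece wrappers deleted from this module, `get_tokenizer` always takes the Hugging Face path, and multimodal checkpoints obtain their tokenizer from the processor instead (see the tokenizer_manager changes later in this patch). A small sketch of that split; the checkpoint names are examples only:

```python
from transformers import AutoProcessor, AutoTokenizer

# Multimodal checkpoint: the processor bundles the image processor and the tokenizer.
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
mm_tokenizer = processor.tokenizer

# Text-only checkpoint: a plain fast tokenizer as before.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

print(type(processor).__name__, type(mm_tokenizer).__name__, type(tokenizer).__name__)
```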
- self.tokenizer = tokenizer - self.eos_token_id = tokenizer._special_tokens[EOS] - self.vocab_size = tokenizer.n_vocab - self.chat_template = Template( - "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}" - ) - - def encode(self, x, add_special_tokens=False): - return self.tokenizer.encode(x) - - def decode(self, x): - return self.tokenizer.decode(x) - - def batch_decode( - self, batch, skip_special_tokens=True, spaces_between_special_tokens=False - ): - if isinstance(batch[0], int): - batch = [[x] for x in batch] - return self.tokenizer.decode_batch(batch) - - def apply_chat_template(self, messages, tokenize, add_generation_prompt): - ret = self.chat_template.render( - messages=messages, add_generation_prompt=add_generation_prompt - ) - return self.encode(ret) if tokenize else ret - - -class SentencePieceTokenizer: - def __init__(self, tokenizer_path): - import sentencepiece as spm - from jinja2 import Template - - tokenizer = spm.SentencePieceProcessor(model_file=tokenizer_path) - - # Convert to HF interface - self.tokenizer = tokenizer - self.eos_token_id = tokenizer.eos_id() - self.vocab_size = tokenizer.vocab_size() - self.chat_template = Template( - "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}" - ) - - def encode(self, x, add_special_tokens=False): - return self.tokenizer.encode(x) - - def decode(self, x): - return self.tokenizer.decode(x) - - def batch_decode( - self, batch, skip_special_tokens=True, spaces_between_special_tokens=False - ): - if isinstance(batch[0], int): - batch = [[x] for x in batch] - return self.tokenizer.decode(batch) - - def apply_chat_template(self, messages, tokenize, add_generation_prompt): - ret = self.chat_template.render( - messages=messages, add_generation_prompt=add_generation_prompt - ) - return self.encode(ret) if tokenize else ret diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 56e3d8f799..3f80c64cf9 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -55,6 +55,7 @@ def post_init(self): self.text is not None and self.input_ids is not None ): raise ValueError("Either text or input_ids should be provided.") + if ( isinstance(self.sampling_params, dict) and self.sampling_params.get("n", 1) != 1 @@ -161,10 +162,10 @@ class TokenizedGenerateReqInput: input_ids: List[int] # The pixel values for input images pixel_values: List[float] - # The hash of input images - image_hash: int - # The image size - image_size: List[int] + # The hash values of input images + image_hashes: List[int] + # The image sizes + image_sizes: List[List[int]] # The sampling parameters sampling_params: SamplingParams # Whether to return the logprobs diff --git a/python/sglang/srt/managers/schedule_batch.py 
b/python/sglang/srt/managers/schedule_batch.py index f3af821e4e..5554170a35 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -121,8 +121,8 @@ def __init__(self, rid, origin_input_text, origin_input_ids): # For vision input self.pixel_values = None - self.image_size = None - self.image_offset = None + self.image_sizes = None + self.image_offsets = None self.pad_value = None # Prefix info @@ -600,12 +600,12 @@ def check_for_jump_forward(self, model_runner): if req.pixel_values is not None: ( req.origin_input_ids, - req.image_offset, + req.image_offsets, ) = model_runner.model.pad_input_ids( req.origin_input_ids_unpadded, req.pad_value, - req.pixel_values.shape, - req.image_size, + req.pixel_values, + req.image_sizes, ) jump_forward_reqs.append(req) diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index c74251947b..5ad4152ea9 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -23,6 +23,7 @@ import os from typing import Dict, List, Optional, Tuple, Union +import fastapi import numpy as np import transformers import uvloop @@ -96,21 +97,18 @@ def __init__( trust_remote_code=server_args.trust_remote_code, model_overide_args=model_overide_args, ) - self.is_generation = is_generation_model( self.hf_config.architectures, self.server_args.is_embedding ) - - if server_args.context_length is not None: - self.context_len = server_args.context_length - else: - self.context_len = get_context_length(self.hf_config) + self.context_len = server_args.context_length or get_context_length( + self.hf_config + ) # Create tokenizer if server_args.skip_tokenizer_init: self.tokenizer = self.processor = None else: - if is_multimodal_model(self.model_path): + if is_multimodal_model(self.hf_config.architectures): self.processor = get_processor( server_args.tokenizer_path, tokenizer_mode=server_args.tokenizer_mode, @@ -118,6 +116,9 @@ def __init__( ) self.tokenizer = self.processor.tokenizer os.environ["TOKENIZERS_PARALLELISM"] = "false" + + # We want to parallelize the image pre-processing so we + # create an executor for it self.executor = concurrent.futures.ProcessPoolExecutor( initializer=init_global_processor, mp_context=mp.get_context("fork"), @@ -134,12 +135,14 @@ def __init__( self.to_create_loop = True self.rid_to_state: Dict[str, ReqState] = {} - # for update model weights + # For update model weights self.model_update_lock = asyncio.Lock() self.model_update_result = None async def generate_request( - self, obj: Union[GenerateReqInput, EmbeddingReqInput], request=None + self, + obj: Union[GenerateReqInput, EmbeddingReqInput], + request: Optional[fastapi.Request] = None, ): if self.to_create_loop: self.create_handle_loop() @@ -160,7 +163,7 @@ async def generate_request( async def _handle_single_request( self, obj: Union[GenerateReqInput, EmbeddingReqInput], - request, + request: Optional[fastapi.Request] = None, index: Optional[int] = None, is_cache_for_prefill: Optional[bool] = False, ): @@ -182,8 +185,8 @@ async def _handle_single_request( ) if self.is_generation: - pixel_values, image_hash, image_size = await self._get_pixel_values( - obj.image_data + pixel_values, image_hashes, image_sizes = await self._get_pixel_values( + obj.image_data if not_use_index else obj.image_data[index] ) return_logprob = ( obj.return_logprob if not_use_index else obj.return_logprob[index] @@ -195,7 +198,6 @@ async def 
_handle_single_request( ) if return_logprob and logprob_start_len == -1: logprob_start_len = len(input_ids) - 1 - top_logprobs_num = ( obj.top_logprobs_num if not_use_index @@ -238,13 +240,14 @@ async def _handle_single_request( sampling_params = SamplingParams(**obj.sampling_params[0]) sampling_params.max_new_tokens = 0 - pixel_values, image_hash, image_size = await self._get_pixel_values( + pixel_values, image_hashes, image_sizes = await self._get_pixel_values( obj.image_data[0] ) return_logprob = obj.return_logprob[0] logprob_start_len = obj.logprob_start_len[0] top_logprobs_num = obj.top_logprobs_num[0] + # Send to the controller if self.is_generation: if return_logprob and logprob_start_len == -1: logprob_start_len = len(input_ids) - 1 @@ -253,8 +256,8 @@ async def _handle_single_request( input_text, input_ids, pixel_values, - image_hash, - image_size, + image_hashes, + image_sizes, sampling_params, return_logprob, logprob_start_len, @@ -268,24 +271,24 @@ async def _handle_single_request( input_ids, sampling_params, ) - self.send_to_router.send_pyobj(tokenized_obj) + # Recv results event = asyncio.Event() state = ReqState([], False, event) self.rid_to_state[rid] = state if not is_cache_for_prefill: - async for response in self._wait_for_response( - event, state, obj, rid, request - ): + async for response in self._wait_for_response(state, obj, rid, request): yield response else: assert self.is_generation - await self._wait_for_cache_prefill_response(event, state, obj, rid, request) + await self._wait_for_cache_prefill_response(state, obj, rid, request) yield input_ids async def _handle_batch_request( - self, obj: Union[GenerateReqInput, EmbeddingReqInput], request + self, + obj: Union[GenerateReqInput, EmbeddingReqInput], + request: Optional[fastapi.Request] = None, ): batch_size = obj.batch_size if self.is_generation: @@ -340,8 +343,8 @@ async def _handle_batch_request( if self.is_generation: if obj.return_logprob[index] and obj.logprob_start_len[index] == -1: obj.logprob_start_len[index] = len(input_ids) - 1 - pixel_values, image_hash, image_size = await self._get_pixel_values( - obj.image_data[index] + pixel_values, image_hashes, image_sizes = ( + await self._get_pixel_values(obj.image_data[index]) ) tokenized_obj = TokenizedGenerateReqInput( @@ -349,8 +352,8 @@ async def _handle_batch_request( input_text, input_ids, pixel_values, - image_hash, - image_size, + image_hashes, + image_sizes, sampling_params, obj.return_logprob[index], obj.logprob_start_len[index], @@ -372,7 +375,6 @@ async def _handle_batch_request( generators.append( self._wait_for_response( - event, state, obj, rid, @@ -388,6 +390,7 @@ async def _handle_batch_request( tasks = [asyncio.create_task(gen.__anext__()) for gen in generators] output_list = [None] * len(tasks) + # Recv results while tasks: done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED) @@ -426,25 +429,18 @@ def _get_sampling_params(self, sampling_params_data: dict): sampling_params.verify() return sampling_params - async def _get_pixel_values(self, image_data): - if image_data is None: - return None, None, None - else: - return await self._get_pixel_values_internal(image_data) - async def _wait_for_response( self, - event: asyncio.Event, state: ReqState, obj: Union[GenerateReqInput, EmbeddingReqInput], rid: str, - request, - index: int = None, + request: Optional[fastapi.Request] = None, + index: Optional[int] = None, response_index: int = 0, ): while True: try: - await asyncio.wait_for(event.wait(), timeout=4) + await 
asyncio.wait_for(state.event.wait(), timeout=4) except asyncio.TimeoutError: if request is not None and await request.is_disconnected(): for rid in [obj.rid] if obj.is_single else obj.rid: @@ -478,16 +474,15 @@ async def _wait_for_response( yield out break - event.clear() + state.event.clear() yield out async def _wait_for_cache_prefill_response( self, - event: asyncio.Event, state: ReqState, obj: GenerateReqInput, rid: str, - request, + request: Optional[fastapi.Request] = None, ): while True: try: @@ -514,7 +509,9 @@ def abort_request(self, rid: str): req = AbortReq(rid) self.send_to_router.send_pyobj(req) - async def update_weights(self, obj: UpdateWeightReqInput, request): + async def update_weights( + self, obj: UpdateWeightReqInput, request: Optional[fastapi.Request] = None + ): if self.to_create_loop: self.create_handle_loop() @@ -659,12 +656,11 @@ def detokenize_top_logprobs_tokens(self, top_logprobs, decode_to_text: bool): ) return top_logprobs - async def _get_pixel_values_internal(self, image_data, aspect_ratio=None): - aspect_ratio = ( - getattr(self.hf_config, "image_aspect_ratio", None) - if aspect_ratio is None - else aspect_ratio - ) + async def _get_pixel_values(self, image_data: List[Union[str, bytes]]): + if not image_data: + return None, None, None + + aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None) grid_pinpoints = ( self.hf_config.image_grid_pinpoints if hasattr(self.hf_config, "image_grid_pinpoints") @@ -673,35 +669,42 @@ async def _get_pixel_values_internal(self, image_data, aspect_ratio=None): ) if isinstance(image_data, list) and len(image_data) > 0: - pixel_values, image_hash, image_size = [], [], [] + # Multiple images if len(image_data) > 1: aspect_ratio = "pad" # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. 
We do not use anyres + pixel_values, image_hashes, image_sizes = [], [], [] for img_data in image_data: pixel_v, image_h, image_s = await self._process_single_image( img_data, aspect_ratio, grid_pinpoints ) pixel_values.append(pixel_v) - image_hash.append(image_h) - image_size.append(image_s) - pixel_values = np.stack(pixel_values, axis=0) + image_hashes.append(image_h) + image_sizes.append(image_s) + + if isinstance(pixel_values[0], np.ndarray): + pixel_values = np.stack(pixel_values, axis=0) else: + # A single image pixel_values, image_hash, image_size = await self._process_single_image( image_data[0], aspect_ratio, grid_pinpoints ) - image_hash = [image_hash] - image_size = [image_size] + image_hashes = [image_hash] + image_sizes = [image_size] elif isinstance(image_data, str): + # A single image pixel_values, image_hash, image_size = await self._process_single_image( image_data, aspect_ratio, grid_pinpoints ) - image_hash = [image_hash] - image_size = [image_size] + image_hashes = [image_hash] + image_sizes = [image_size] else: - pixel_values, image_hash, image_size = None, None, None + raise ValueError(f"Invalid image data: {image_data}") - return pixel_values, image_hash, image_size + return pixel_values, image_hashes, image_sizes - async def _process_single_image(self, image_data, aspect_ratio, grid_pinpoints): + async def _process_single_image( + self, image_data: Union[bytes, str], aspect_ratio: str, grid_pinpoints: str + ): if self.executor is not None: loop = asyncio.get_event_loop() return await loop.run_in_executor( @@ -732,12 +735,16 @@ def init_global_processor(server_args: ServerArgs): def _process_single_image_task( - image_data, image_aspect_ratio=None, image_grid_pinpoints=None, processor=None + image_data: Union[str, bytes], + image_aspect_ratio: Optional[str] = None, + image_grid_pinpoints: Optional[str] = None, + processor=None, ): try: processor = processor or global_processor image, image_size = load_image(image_data) if image_size is not None: + # It is a video with multiple images image_hash = hash(image_data) pixel_values = processor.image_processor(image)["pixel_values"] for _ in range(len(pixel_values)): @@ -745,6 +752,7 @@ def _process_single_image_task( pixel_values = np.stack(pixel_values, axis=0) return pixel_values, image_hash, image_size else: + # It is an image image_hash = hash(image_data) if image_aspect_ratio == "pad": image = expand2square( @@ -754,13 +762,18 @@ def _process_single_image_task( pixel_values = processor.image_processor(image.convert("RGB"))[ "pixel_values" ][0] - elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio: + elif image_aspect_ratio == "anyres" or ( + image_aspect_ratio is not None and "anyres_max" in image_aspect_ratio + ): pixel_values = process_anyres_image( image, processor.image_processor, image_grid_pinpoints ) else: pixel_values = processor.image_processor(image)["pixel_values"][0] - pixel_values = pixel_values.astype(np.float16) + + if isinstance(pixel_values, np.ndarray): + pixel_values = pixel_values.astype(np.float16) + return pixel_values, image_hash, image.size except Exception: logger.error("Exception in TokenizerManager:\n" + get_exception_traceback()) diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index 65daed43b2..cd1b580643 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -108,7 +108,7 @@ def __init__( if server_args.skip_tokenizer_init: self.tokenizer = self.processor = None else: - if 
is_multimodal_model(server_args.model_path): + if is_multimodal_model(self.model_config.hf_config.architectures): self.processor = get_processor( server_args.tokenizer_path, tokenizer_mode=server_args.tokenizer_mode, @@ -333,26 +333,24 @@ def handle_generate_request( if self.model_runner.is_generation: req.pixel_values = recv_req.pixel_values if req.pixel_values is not None: - image_hash = ( - hash(tuple(recv_req.image_hash)) - if isinstance(recv_req.image_hash, list) - else recv_req.image_hash - ) + # Use image hash as fake token_ids, which is then used + # for prefix matching + image_hash = hash(tuple(recv_req.image_hashes)) req.pad_value = [ (image_hash) % self.model_config.vocab_size, (image_hash >> 16) % self.model_config.vocab_size, (image_hash >> 32) % self.model_config.vocab_size, (image_hash >> 64) % self.model_config.vocab_size, ] - req.image_size = recv_req.image_size + req.image_sizes = recv_req.image_sizes ( req.origin_input_ids, - req.image_offset, + req.image_offsets, ) = self.model_runner.model.pad_input_ids( req.origin_input_ids_unpadded, req.pad_value, - req.pixel_values.shape, - req.image_size, + req.pixel_values, + req.image_sizes, ) req.return_logprob = recv_req.return_logprob req.logprob_start_len = recv_req.logprob_start_len @@ -368,6 +366,7 @@ def handle_generate_request( req.jump_forward_map = self.jump_forward_cache.query( computed_regex_string ) + # Init regex fsm elif req.sampling_params.regex is not None: req.regex_fsm = self.regex_fsm_cache.query(req.sampling_params.regex) diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py index c107b3bc82..f24cdf6b72 100644 --- a/python/sglang/srt/model_executor/forward_batch_info.py +++ b/python/sglang/srt/model_executor/forward_batch_info.py @@ -16,7 +16,7 @@ """ModelRunner runs the forward passes of the models.""" from dataclasses import dataclass from enum import IntEnum, auto -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, List import numpy as np import torch @@ -58,6 +58,7 @@ class InputMetadata: # For extend extend_seq_lens: torch.Tensor = None + extend_prefix_lens: torch.Tensor = None extend_start_loc: torch.Tensor = None extend_no_prefix: bool = None @@ -69,8 +70,8 @@ class InputMetadata: # For multimodal pixel_values: List[torch.Tensor] = None - image_sizes: List[List[int]] = None - image_offsets: List[int] = None + image_sizes: List[List[List[int]]] = None + image_offsets: List[List[int]] = None # Trition attention backend triton_max_seq_len: int = 0 @@ -87,20 +88,8 @@ class InputMetadata: def init_multimuldal_info(self, batch: ScheduleBatch): reqs = batch.reqs self.pixel_values = [r.pixel_values for r in reqs] - self.image_sizes = [r.image_size for r in reqs] - self.image_offsets = [] - for r in reqs: - if isinstance(r.image_offset, list): - self.image_offsets.append( - [ - (image_offset - len(r.prefix_indices)) - for image_offset in r.image_offset - ] - ) - elif isinstance(r.image_offset, int): - self.image_offsets.append(r.image_offset - len(r.prefix_indices)) - elif r.image_offset is None: - self.image_offsets.append(0) + self.image_sizes = [r.image_sizes for r in reqs] + self.image_offsets = [r.image_offsets for r in reqs] def compute_positions(self, batch: ScheduleBatch): position_ids_offsets = batch.position_ids_offsets @@ -153,6 +142,7 @@ def compute_extend_infos(self, batch: ScheduleBatch): for i, r in enumerate(batch.reqs) ] self.extend_seq_lens = torch.tensor(extend_lens_cpu, device="cuda") + 
self.extend_prefix_lens = torch.tensor(batch.prefix_lens_cpu, device="cuda") self.extend_start_loc = torch.zeros_like(self.seq_lens) self.extend_start_loc[1:] = torch.cumsum(self.extend_seq_lens[:-1], dim=0) self.extend_no_prefix = all(l == 0 for l in batch.prefix_lens_cpu) @@ -238,10 +228,10 @@ def init_flashinfer_handlers( prefix_lens_cpu, flashinfer_use_ragged, ): - if self.forward_mode != ForwardMode.DECODE: - prefix_lens = torch.tensor(prefix_lens_cpu, device="cuda") - else: + if self.forward_mode == ForwardMode.DECODE: prefix_lens = None + else: + prefix_lens = self.extend_prefix_lens update_flashinfer_indices( self.forward_mode, diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index abee152d6f..8ef47a530f 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -50,7 +50,7 @@ MLATokenToKVPool, ReqToTokenPool, ) -from sglang.srt.model_config import AttentionArch +from sglang.srt.model_config import AttentionArch, ModelConfig from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata from sglang.srt.server_args import ServerArgs from sglang.srt.utils import ( @@ -69,7 +69,7 @@ class ModelRunner: def __init__( self, - model_config, + model_config: ModelConfig, mem_fraction_static: float, gpu_id: int, tp_rank: int, @@ -85,7 +85,9 @@ def __init__( self.tp_size = tp_size self.nccl_port = nccl_port self.server_args = server_args - self.is_multimodal_model = is_multimodal_model(self.model_config) + self.is_multimodal_model = is_multimodal_model( + self.model_config.hf_config.architectures + ) global_server_args_dict.update( { "disable_flashinfer": server_args.disable_flashinfer, @@ -95,6 +97,13 @@ def __init__( } ) + if self.is_multimodal_model: + logger.info( + "Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models." + ) + server_args.chunked_prefill_size = None + server_args.mem_fraction_static *= 0.95 + min_per_gpu_memory = self.init_torch_distributed() self.load_model() self.init_memory_pool( @@ -507,9 +516,9 @@ def init_cuda_graphs(self): raise Exception( f"Capture cuda graph failed: {e}\n" "Possible solutions:\n" - "1. disable torch compile by not using --enable-torch-compile\n" - "2. disable cuda graph by --disable-cuda-graph\n" - "3. set --mem-fraction-static to a smaller value\n" + "1. disable cuda graph by --disable-cuda-graph\n" + "2. set --mem-fraction-static to a smaller value\n" + "3. 
disable torch compile by not using --enable-torch-compile\n" "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n" ) diff --git a/python/sglang/srt/models/chatglm.py b/python/sglang/srt/models/chatglm.py index 0a22f994bb..b38b62fafd 100644 --- a/python/sglang/srt/models/chatglm.py +++ b/python/sglang/srt/models/chatglm.py @@ -17,7 +17,7 @@ # Adapted from # https://github.com/THUDM/ChatGLM2-6B """Inference-only ChatGLM model compatible with THUDM weights.""" -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, Optional, Tuple import torch from torch import nn diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py index 4a0a08bf88..daf6f25da1 100644 --- a/python/sglang/srt/models/grok.py +++ b/python/sglang/srt/models/grok.py @@ -273,9 +273,9 @@ def forward( ) -> torch.Tensor: if input_embeds is None: hidden_states = self.embed_tokens(input_ids) + hidden_states.mul_(self.config.embedding_multiplier_scale) else: hidden_states = input_embeds - hidden_states.mul_(self.config.embedding_multiplier_scale) for i in range(len(self.layers)): hidden_states = self.layers[i](positions, hidden_states, input_metadata) @@ -284,7 +284,7 @@ def forward( return hidden_states -class Grok1ModelForCausalLM(nn.Module): +class Grok1ForCausalLM(nn.Module): def __init__( self, config: PretrainedConfig, @@ -415,4 +415,10 @@ def _prepare_presharded_weights( return hf_folder, hf_weights_files, use_safetensors -EntryClass = Grok1ModelForCausalLM +class Grok1ModelForCausalLM(Grok1ForCausalLM): + """An alias for backward-compatbility.""" + + pass + + +EntryClass = [Grok1ForCausalLM, Grok1ModelForCausalLM] diff --git a/python/sglang/srt/models/llama2.py b/python/sglang/srt/models/llama2.py index 9de8d33c5c..fe75916a43 100644 --- a/python/sglang/srt/models/llama2.py +++ b/python/sglang/srt/models/llama2.py @@ -357,6 +357,9 @@ def load_weights_per_param(name, loaded_weight): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. return + if name.startswith("model.vision_tower") and name not in params_dict: + return + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue @@ -364,8 +367,6 @@ def load_weights_per_param(name, loaded_weight): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue - if name.startswith("model.vision_tower") and name not in params_dict: - continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) @@ -374,8 +375,6 @@ def load_weights_per_param(name, loaded_weight): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: return - if name.startswith("model.vision_tower") and name not in params_dict: - return param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) diff --git a/python/sglang/srt/models/llama_classification.py b/python/sglang/srt/models/llama_classification.py index 02224971d6..c5effbfc9c 100644 --- a/python/sglang/srt/models/llama_classification.py +++ b/python/sglang/srt/models/llama_classification.py @@ -103,8 +103,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. 
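Several `load_weights` implementations in this patch hoist the `model.vision_tower` skip above the stacked-parameter loop (llama2.py, llama_embedding.py, qwen2.py) or drop the now-redundant copies inside it (llama_classification.py). A toy, self-contained sketch of the resulting filtering order, with made-up names and the GPTQ bias check omitted for brevity:

```python
params_dict = {"model.layers.0.self_attn.qkv_proj.weight": "param-object"}
stacked_params_mapping = [
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
]

def resolve(name):
    """Return (resolved_name, shard_id) or None if the weight is skipped."""
    if "rotary_emb.inv_freq" in name:
        return None  # recomputed buffer, never loaded from the checkpoint
    if name.startswith("model.vision_tower") and name not in params_dict:
        return None  # vision tower weights are loaded by the vision model itself
    for param_name, weight_name, shard_id in stacked_params_mapping:
        if weight_name not in name:
            continue
        name = name.replace(weight_name, param_name)
        return (name, shard_id) if name in params_dict else None
    return (name, None) if name in params_dict else None

print(resolve("model.layers.0.self_attn.q_proj.weight"))      # maps into qkv_proj, shard "q"
print(resolve("model.vision_tower.encoder.layers.0.weight"))  # None, skipped early
```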
if name.endswith(".bias") and name not in params_dict: continue - if name.startswith("model.vision_tower") and name not in params_dict: - continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) @@ -113,8 +111,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue - if name.startswith("model.vision_tower") and name not in params_dict: - continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) diff --git a/python/sglang/srt/models/llama_embedding.py b/python/sglang/srt/models/llama_embedding.py index dfff53cbcd..e4e9174f14 100644 --- a/python/sglang/srt/models/llama_embedding.py +++ b/python/sglang/srt/models/llama_embedding.py @@ -57,6 +57,9 @@ def load_weights_per_param(name, loaded_weight): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. return + if name.startswith("model.vision_tower") and name not in params_dict: + return + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue @@ -64,8 +67,6 @@ def load_weights_per_param(name, loaded_weight): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue - if name.startswith("model.vision_tower") and name not in params_dict: - continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) @@ -74,8 +75,6 @@ def load_weights_per_param(name, loaded_weight): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: return - if name.startswith("model.vision_tower") and name not in params_dict: - return param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) diff --git a/python/sglang/srt/models/llava.py b/python/sglang/srt/models/llava.py index 76a0630fc2..bc522bec9c 100644 --- a/python/sglang/srt/models/llava.py +++ b/python/sglang/srt/models/llava.py @@ -28,7 +28,6 @@ LlavaConfig, MistralConfig, Qwen2Config, - SiglipVisionConfig, SiglipVisionModel, ) from transformers.models.llava.modeling_llava import LlavaMultiModalProjector @@ -66,13 +65,18 @@ def __init__( torch.empty(config.text_config.hidden_size, dtype=torch.float16) ) - def pad_input_ids(self, input_ids, pad_value, pt_shape=None, image_size=None): - + def pad_input_ids( + self, + input_ids: List[int], + pad_value: List[int], + pixel_values: List, + image_sizes: List[List[int]], + ): # hardcode for spatial_unpad + anyres - image_aspect_ratio = "anyres" if len(image_size) == 1 else "pad" + image_aspect_ratio = "anyres" if len(image_sizes) == 1 else "pad" offset_list = [] - for image_s in image_size: - if len(image_size) > 16: + for image_s in image_sizes: + if len(image_sizes) > 16: # 2x2 pooling with stride 2 new_image_feature_len = ( math.ceil(self.image_size / self.patch_size / 2) ** 2 @@ -153,17 +157,15 @@ def forward( if input_metadata.forward_mode == ForwardMode.EXTEND: bs = input_metadata.batch_size - # Embed text input + # Embed text inputs input_embeds = self.language_model.model.embed_tokens(input_ids) - # Embed vision input - need_vision = ( - (positions[input_metadata.extend_start_loc] < self.image_feature_len) - .cpu() - .numpy() + + # Whether the requests need vision inputs + max_image_offset = 
np.array( + [max(image_offsets[i]) if image_offsets[i] else -1 for i in range(bs)] ) - # FIXME: We need to substract the length of the system prompt - has_pixel = np.array([pixel_values[i] is not None for i in range(bs)]) - need_vision = need_vision & has_pixel + start_positions = positions[input_metadata.extend_start_loc].cpu().numpy() + need_vision = start_positions <= max_image_offset if need_vision.any(): pixel_values = [pixel_values[i] for i in range(bs) if need_vision[i]] @@ -332,31 +334,35 @@ def forward( new_image_features.append(image_feature) image_features = new_image_features + # Fill in the placeholder for the image extend_start_loc_cpu = input_metadata.extend_start_loc.cpu().numpy() + prefix_lens_cpu = input_metadata.extend_prefix_lens.cpu().numpy() pt = 0 for i in range(bs): if not need_vision[i]: continue start_idx = extend_start_loc_cpu[i] - pad_dim = image_features[pt].shape[-1] # 576, 4096 - dim = input_embeds.shape[1] - assert ( - pad_dim == dim - ), "invalid pad_dim={}, input_embed_dim={}!".format(pad_dim, dim) - # Fill in the placeholder for the image - try: - for j, image_off in enumerate(image_offsets[i]): - # print("actual image_features length: ", image_features[pt][j].shape[0]) - pad_len = image_features[pt][j].shape[0] - input_embeds[ - start_idx + image_off : start_idx + image_off + pad_len - ] = image_features[pt][j] - except RuntimeError as e: - print(f"RuntimeError in llava image encoding: {e}") - print(image_features[pt].shape) - print(input_embeds.shape) - print(start_idx, image_offsets[i]) + prefix_len = prefix_lens_cpu[i] + + # Multiple images + for j, image_offset in enumerate(image_offsets[i]): + if image_offset < prefix_len: + continue + + tmp_image_feature = image_features[pt][j] + pad_len = tmp_image_feature.shape[0] + + left_idx = start_idx + (image_offset - prefix_len) + right_idx = start_idx + (image_offset - prefix_len) + pad_len + try: + input_embeds[left_idx:right_idx] = tmp_image_feature + except RuntimeError as e: + print(f"RuntimeError in image encoding: {e}") + print(f"{input_embeds.shape=}, {tmp_image_feature.shape=}") + print( + f"{start_idx=}, {image_offset=}, {prefix_len=}, {pad_len=}" + ) pt += 1 return self.language_model( @@ -366,8 +372,9 @@ def forward( return self.language_model(input_ids, positions, input_metadata) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - # load clip vision model by cfg['mm_vision_tower']: - # huggingface_name or path_of_clip_relative_to_llava_model_dir + # Load clip vision model by cfg['mm_vision_tower']: + # huggingface_name or path_of_clip_relative_to_llava_model_dir + # We put the initialization here instead of __init__ to allow it being reused by other subclasses. vision_path = self.config.mm_vision_tower if "clip" in vision_path: self.vision_tower = CLIPVisionModel.from_pretrained( @@ -422,8 +429,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # load language model self.language_model.load_weights(weights) - monkey_path_clip_vision_embed_forward() - @property def num_patches_per_side(self): return self.image_size // self.patch_size @@ -495,36 +500,4 @@ def __init__( ) -first_call = True - - -def clip_vision_embed_forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] - - # Move this conv layer to CPU to avoid a bug in torch >= 2.1 on A10G. 
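The fill loop added above indexes image features with offsets measured on the full `input_ids`, so the cached prefix length has to be subtracted before writing into this batch's `input_embeds`. A toy illustration with made-up sizes:

```python
import torch

hidden = 8
input_embeds = torch.zeros(32, hidden)   # embeddings for one extend request
start_idx = 0                            # extend_start_loc of this request
prefix_len = 10                          # tokens already served from the radix cache
image_offsets = [4, 18]                  # placeholder offsets in the full sequence
image_features = [torch.ones(6, hidden), 2 * torch.ones(6, hidden)]

for feat, image_offset in zip(image_features, image_offsets):
    if image_offset < prefix_len:
        continue                         # this image lies inside the cached prefix
    left = start_idx + (image_offset - prefix_len)
    right = left + feat.shape[0]
    input_embeds[left:right] = feat

print(input_embeds[:16, 0])              # rows 8..13 hold the second image's features
```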
- global first_call - if first_call: - self.patch_embedding.cpu().float() - first_call = False - pixel_values = pixel_values.to(dtype=torch.float32, device="cpu") - patch_embeds = self.patch_embedding(pixel_values).cuda().half() - - patch_embeds = patch_embeds.flatten(2).transpose(1, 2) - - class_embeds = self.class_embedding.expand(batch_size, 1, -1) - embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) - return embeddings - - -def monkey_path_clip_vision_embed_forward(): - import transformers - - setattr( - transformers.models.clip.modeling_clip.CLIPVisionEmbeddings, - "forward", - clip_vision_embed_forward, - ) - - EntryClass = [LlavaLlamaForCausalLM, LlavaQwenForCausalLM, LlavaMistralForCausalLM] diff --git a/python/sglang/srt/models/llavavid.py b/python/sglang/srt/models/llavavid.py index 8b81251d69..44e400ff6a 100644 --- a/python/sglang/srt/models/llavavid.py +++ b/python/sglang/srt/models/llavavid.py @@ -26,11 +26,6 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from sglang.srt.mm_utils import ( - get_anyres_image_grid_shape, - unpad_image, - unpad_image_shape, -) from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata from sglang.srt.models.llama2 import LlamaForCausalLM @@ -59,23 +54,14 @@ def __init__( torch.empty(config.text_config.hidden_size, dtype=torch.float16) ) - def pad_input_ids(self, input_ids, pad_value, pt_shape=None, image_size=None): + def pad_input_ids( + self, + input_ids: List[int], + pad_value: List[int], + pixel_values: List, + image_sizes: List[List[int]], + ): new_image_feature_len = self.image_feature_len - # now only support spatial_unpad + anyres - # if self.mm_patch_merge_type.startswith("spatial"): - # height = width = self.num_patches_per_side - # if pt_shape[0] > 1: - # if self.image_aspect_ratio == "anyres": - # num_patch_width, num_patch_height = get_anyres_image_grid_shape( - # image_size, - # self.image_grid_pinpoints, - # self.vision_tower.config.image_size, - # ) - # if "unpad" in self.mm_patch_merge_type: - # h = num_patch_height * height - # w = num_patch_width * width - # new_h, new_w = unpad_image_shape(h, w, image_size) - # new_image_feature_len += new_h * (new_w + 1) pad_ids = pad_value * ( (new_image_feature_len + len(pad_value)) // len(pad_value) @@ -87,7 +73,7 @@ def pad_input_ids(self, input_ids, pad_value, pt_shape=None, image_size=None): + pad_ids[:new_image_feature_len] + input_ids[offset + 1 :] ) - return new_input_ids, offset + return new_input_ids, [offset] def encode_images(self, pixel_values: torch.Tensor) -> torch.Tensor: image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) @@ -133,22 +119,18 @@ def forward( if input_metadata.forward_mode == ForwardMode.EXTEND: bs = input_metadata.batch_size - # Embed text input + # Embed text inputs input_embeds = self.language_model.model.embed_tokens(input_ids) - # Embed vision input - need_vision = ( - (positions[input_metadata.extend_start_loc] < self.image_feature_len) - .cpu() - .numpy() + # Whether the requests need vision inputs + max_image_offset = np.array( + [max(image_offsets[i]) if image_offsets[i] else -1 for i in range(bs)] ) - # FIXME: We need to substract the length of the system prompt - has_pixel = np.array([pixel_values[i] is not None for i in range(bs)]) - need_vision = need_vision & has_pixel + start_positions = 
positions[input_metadata.extend_start_loc].cpu().numpy() + need_vision = start_positions <= max_image_offset if need_vision.any(): pixel_values = [pixel_values[i] for i in range(bs) if need_vision[i]] - image_sizes = [image_sizes[i] for i in range(bs) if need_vision[i]] ########## Encode Image ######## @@ -183,31 +165,36 @@ def forward( new_image_features.append(image_feature.flatten(0, 1)) image_features = new_image_features + # Fill in the placeholder for the image extend_start_loc_cpu = input_metadata.extend_start_loc.cpu().numpy() + prefix_lens_cpu = input_metadata.extend_prefix_lens.cpu().numpy() pt = 0 for i in range(bs): if not need_vision[i]: continue start_idx = extend_start_loc_cpu[i] - pad_len, pad_dim = image_features[pt].shape # 576, 4096 - dim = input_embeds.shape[1] - assert ( - pad_dim == dim - ), "invalid pad_dim={}, input_embed_dim={}!".format(pad_dim, dim) - # Fill in the placeholder for the image - try: - input_embeds[ - start_idx - + image_offsets[i] : start_idx - + image_offsets[i] - + pad_len - ] = image_features[pt] - except RuntimeError as e: - print(f"RuntimeError in llava image encoding: {e}") - print(input_embeds.shape) - print(start_idx, image_offsets[i]) - pt += 1 + prefix_len = prefix_lens_cpu[i] + + # Multiple images + for image_offset in image_offsets[i]: + if image_offset < prefix_len: + continue + + tmp_image_feature = image_features[pt] + pad_len = tmp_image_feature.shape[0] + + left_idx = start_idx + (image_offset - prefix_len) + right_idx = start_idx + (image_offset - prefix_len) + pad_len + try: + input_embeds[left_idx:right_idx] = tmp_image_feature + except RuntimeError as e: + print(f"RuntimeError in image encoding: {e}") + print(f"{input_embeds.shape=}, {tmp_image_feature.shape=}") + print( + f"{start_idx=}, {image_offset=}, {prefix_len=}, {pad_len=}" + ) + pt += 1 return self.language_model( input_ids, positions, input_metadata, input_embeds=input_embeds @@ -216,8 +203,9 @@ def forward( return self.language_model(input_ids, positions, input_metadata) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - # load clip vision model by cfg['mm_vision_tower']: - # huggingface_name or path_of_clip_relative_to_llava_model_dir + # Load clip vision model by cfg['mm_vision_tower']: + # huggingface_name or path_of_clip_relative_to_llava_model_dir + # We put the initialization here instead of __init__ to allow it being reused by other subclasses. vision_path = self.config.mm_vision_tower self.vision_tower = CLIPVisionModel.from_pretrained( vision_path, torch_dtype=torch.float16 @@ -271,43 +259,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # load language model self.language_model.load_weights(weights) - monkey_path_clip_vision_embed_forward() - @property def num_patches_per_side(self): return self.image_size // self.patch_size -first_call = True - - -def clip_vision_embed_forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - batch_size = pixel_values.shape[0] - - # Move this conv layer to CPU to avoid a bug in torch >= 2.1 on A10G. 
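Both LLaVA variants now gate the vision tower per request with the same comparison: image encoding is skipped when a request's first new position already lies past its last image placeholder. Made-up numbers sketching that check:

```python
import numpy as np

image_offsets = [[5, 90], [], [12]]      # per-request placeholder offsets (full sequence)
start_positions = np.array([80, 0, 40])  # positions[extend_start_loc] per request

max_image_offset = np.array([max(o) if o else -1 for o in image_offsets])
need_vision = start_positions <= max_image_offset
print(need_vision)                       # [ True False False]
```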
- global first_call - if first_call: - self.patch_embedding.cpu().float() - first_call = False - pixel_values = pixel_values.to(dtype=torch.float32, device="cpu") - patch_embeds = self.patch_embedding(pixel_values).cuda().half() - - patch_embeds = patch_embeds.flatten(2).transpose(1, 2) - - class_embeds = self.class_embedding.expand(batch_size, 1, -1) - embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - embeddings = embeddings + self.position_embedding(self.position_ids) - return embeddings - - -def monkey_path_clip_vision_embed_forward(): - import transformers - - setattr( - transformers.models.clip.modeling_clip.CLIPVisionEmbeddings, - "forward", - clip_vision_embed_forward, - ) - - EntryClass = LlavaVidForCausalLM diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py index fcf083e1b5..a0c54f6910 100644 --- a/python/sglang/srt/models/qwen2.py +++ b/python/sglang/srt/models/qwen2.py @@ -312,6 +312,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue + if name.startswith("model.vision_tower") and name not in params_dict: + continue + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue @@ -319,8 +322,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue - if name.startswith("model.vision_tower") and name not in params_dict: - continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) @@ -329,8 +330,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. 
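Later in this patch, `is_multimodal_model` in utils.py stops matching substrings of the model path and checks `hf_config.architectures` instead, which is what tp_worker and model_runner now pass in. A condensed sketch of that check with literal inputs:

```python
def is_multimodal_model(model_architectures):
    multimodal_archs = {
        "LlavaLlamaForCausalLM",
        "LlavaQwenForCausalLM",
        "LlavaMistralForCausalLM",
        "LlavaVidForCausalLM",
    }
    return any(arch in multimodal_archs for arch in model_architectures)

print(is_multimodal_model(["LlavaQwenForCausalLM"]))  # True
print(is_multimodal_model(["Qwen2ForCausalLM"]))      # False
```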
if name.endswith(".bias") and name not in params_dict: continue - if name.startswith("model.vision_tower") and name not in params_dict: - continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) diff --git a/python/sglang/srt/models/yivl.py b/python/sglang/srt/models/yivl.py index 11d4cda1c0..0f86206d82 100644 --- a/python/sglang/srt/models/yivl.py +++ b/python/sglang/srt/models/yivl.py @@ -24,10 +24,7 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from sglang.srt.models.llava import ( - LlavaLlamaForCausalLM, - monkey_path_clip_vision_embed_forward, -) +from sglang.srt.models.llava import LlavaLlamaForCausalLM class YiVLForCausalLM(LlavaLlamaForCausalLM): @@ -50,7 +47,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): self.config._name_or_path, torch_dtype=torch.float16, subfolder=self.vision_tower_subfolder, - ).cuda() + ).to("cuda") self.vision_tower.eval() @@ -94,8 +91,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # load language model self.language_model.load_weights(weights) - monkey_path_clip_vision_embed_forward() - class YiVLMultiModalProjector(nn.Module): def __init__(self, config: LlavaConfig): diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index f3d1ab0f94..9c36216ede 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -335,12 +335,12 @@ def launch_server( pipe_detoken_reader, pipe_detoken_writer = mp.Pipe(duplex=False) if server_args.dp_size == 1: - start_process = start_controller_process_single + start_controller_process = start_controller_process_single else: - start_process = start_controller_process_multi + start_controller_process = start_controller_process_multi proc_controller = mp.Process( - target=start_process, + target=start_controller_process, args=(server_args, port_args, pipe_controller_writer, model_overide_args), ) proc_controller.start() @@ -421,7 +421,7 @@ def _set_envs_and_config(server_args: ServerArgs): if not server_args.disable_flashinfer: assert_pkg_version( "flashinfer", - "0.1.6", + "0.1.5", "Please uninstall the old version and " "reinstall the latest version by following the instructions " "at https://docs.flashinfer.ai/installation.html.", diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index a6e710009f..b7bb657306 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -26,7 +26,7 @@ import time from importlib.metadata import PackageNotFoundError, version from io import BytesIO -from typing import List, Optional +from typing import List, Optional, Union import numpy as np import psutil @@ -193,35 +193,16 @@ def allocate_init_ports( return ret_ports[0], ret_ports[1:num_ports_needed] -def get_int_token_logit_bias(tokenizer, vocab_size): - """Get the logit bias for integer-only tokens.""" - # a bug when model's vocab size > tokenizer.vocab_size - if tokenizer == None: - return [-1e5] * vocab_size - vocab_size = tokenizer.vocab_size - logit_bias = np.zeros(vocab_size, dtype=np.float32) - for t_id in range(vocab_size): - ss = tokenizer.decode([t_id]).strip() - if not (ss.isdigit() or len(ss) == 0 or t_id == tokenizer.eos_token_id): - logit_bias[t_id] = -1e5 - - return logit_bias - - -def is_multimodal_model(model): - from sglang.srt.model_config import ModelConfig - - if isinstance(model, str): - model = 
model.lower() - return "llava" in model or "yi-vl" in model or "llava-next" in model - - if isinstance(model, ModelConfig): - model_path = model.path.lower() - return ( - "llava" in model_path or "yi-vl" in model_path or "llava-next" in model_path - ) - - raise ValueError("unrecognized type") +def is_multimodal_model(model_architectures): + if ( + "LlavaLlamaForCausalLM" in model_architectures + or "LlavaQwenForCausalLM" in model_architectures + or "LlavaMistralForCausalLM" in model_architectures + or "LlavaVidForCausalLM" in model_architectures + ): + return True + else: + return False def is_generation_model(model_architectures, is_embedding: bool = False): @@ -317,12 +298,14 @@ def decode_video_base64(video_base64): ) # Return an empty array and size tuple if no frames were found -def load_image(image_file): +def load_image(image_file: Union[str, bytes]): from PIL import Image image = image_size = None - if image_file.startswith("http://") or image_file.startswith("https://"): + if isinstance(image_file, bytes): + image = Image.open(BytesIO(image_file)) + elif image_file.startswith("http://") or image_file.startswith("https://"): timeout = int(os.getenv("REQUEST_TIMEOUT", "3")) response = requests.get(image_file, timeout=timeout) image = Image.open(BytesIO(response.content)) @@ -334,8 +317,10 @@ def load_image(image_file): elif image_file.startswith("video:"): image_file = image_file.replace("video:", "") image, image_size = decode_video_base64(image_file) - else: + elif isinstance(image_file, str): image = Image.open(BytesIO(base64.b64decode(image_file))) + else: + raise ValueError(f"Invalid image: {image}") return image, image_size diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py index 0003e4776a..cf29c0e815 100644 --- a/test/srt/test_vision_openai_server.py +++ b/test/srt/test_vision_openai_server.py @@ -32,8 +32,6 @@ def setUpClass(cls): other_args=[ "--chat-template", "chatml-llava", - "--chunked-prefill-size", - "16384", # "--log-requests", ], ) From c411f32e1c9b551011a52566b5afae1320a99fde Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Thu, 29 Aug 2024 00:07:02 +1000 Subject: [PATCH 109/118] feat: replace GeluAndMul (#1234) --- python/sglang/srt/layers/activation.py | 14 ++++++++++---- python/sglang/srt/models/gemma.py | 4 ++-- test/srt/models/test_generation_models.py | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/python/sglang/srt/layers/activation.py b/python/sglang/srt/layers/activation.py index 5df387cb2b..9047197af2 100644 --- a/python/sglang/srt/layers/activation.py +++ b/python/sglang/srt/layers/activation.py @@ -18,7 +18,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from flashinfer.activation import gelu_tanh_and_mul, silu_and_mul +from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul from vllm.distributed import ( divide, get_tensor_model_parallel_rank, @@ -43,18 +43,24 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: class GeluAndMul(CustomOp): - def __init__(self, **kwargs): + def __init__(self, approximate="tanh"): super().__init__() + self.approximate = approximate def forward_native(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 - return F.gelu(x[..., :d], approximate="tanh") * x[..., d:] + return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:] def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 output_shape = x.shape[:-1] + (d,) out = torch.empty(output_shape, dtype=x.dtype, 
device=x.device) - gelu_tanh_and_mul(x, out) + if self.approximate == "tanh": + gelu_tanh_and_mul(x, out) + elif self.approximate == "none": + gelu_and_mul(x, out) + else: + raise RuntimeError("GeluAndMul only support tanh or none") return out diff --git a/python/sglang/srt/models/gemma.py b/python/sglang/srt/models/gemma.py index 990937f518..ae3b1b1948 100644 --- a/python/sglang/srt/models/gemma.py +++ b/python/sglang/srt/models/gemma.py @@ -23,7 +23,6 @@ from transformers import PretrainedConfig from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import GeluAndMul from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, @@ -34,6 +33,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from sglang.srt.layers.activation import GeluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention @@ -60,7 +60,7 @@ def __init__( bias=False, quant_config=quant_config, ) - self.act_fn = GeluAndMul() + self.act_fn = GeluAndMul("none") def forward(self, x): gate_up, _ = self.gate_up_proj(x) diff --git a/test/srt/models/test_generation_models.py b/test/srt/models/test_generation_models.py index e38584741e..08288c510c 100644 --- a/test/srt/models/test_generation_models.py +++ b/test/srt/models/test_generation_models.py @@ -96,7 +96,7 @@ def assert_close_prefill_logits_and_output_strs( if hf_logprobs.shape[0] <= 100: assert torch.all( abs(hf_logprobs - srt_logprobs) < prefill_tolerance - ), "prefill logprobs are not all close" + ), f"prefill logprobs are not all close with model_path={model_path} prompts={prompts} prefill_tolerance={prefill_tolerance}" print(f"hf_outputs.output_strs={hf_outputs.output_strs}") print(f"srt_outputs.output_strs={srt_outputs.output_strs}") From 0a97d7962d31728a3e4d5936b105ab27a83cd1a9 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 28 Aug 2024 08:38:50 -0700 Subject: [PATCH 110/118] [Fix] Fix OOM in llava base class (#1249) --- .../http_llava_onevision_test.py | 2 +- python/sglang/srt/models/llava.py | 51 ++++++++++--------- python/sglang/srt/server.py | 2 +- 3 files changed, 30 insertions(+), 25 deletions(-) diff --git a/examples/runtime/llava_onevision/http_llava_onevision_test.py b/examples/runtime/llava_onevision/http_llava_onevision_test.py index 41d60b12af..0c93d2ce2b 100644 --- a/examples/runtime/llava_onevision/http_llava_onevision_test.py +++ b/examples/runtime/llava_onevision/http_llava_onevision_test.py @@ -1,7 +1,7 @@ """ Usage: -python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384 +python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava python3 http_llava_onevision_test.py """ diff --git a/python/sglang/srt/models/llava.py b/python/sglang/srt/models/llava.py index bc522bec9c..7dcf5348b0 100644 --- a/python/sglang/srt/models/llava.py +++ b/python/sglang/srt/models/llava.py @@ -46,25 +46,7 @@ from sglang.srt.models.qwen2 import Qwen2ForCausalLM -class LlavaLlamaForCausalLM(nn.Module): - def __init__( - self, - config: LlavaConfig, - quant_config: Optional[QuantizationConfig] = None, - 
cache_config: Optional[CacheConfig] = None, - ) -> None: - super().__init__() - self.config = config - self.vision_tower = None - self.config.vision_config.hidden_size = config.mm_hidden_size - self.config.text_config.hidden_size = config.hidden_size - self.multi_modal_projector = LlavaMultiModalProjector(config) - self.language_model = LlamaForCausalLM(config, quant_config=quant_config) - if "unpad" in getattr(config, "mm_patch_merge_type", ""): - self.language_model.model.image_newline = nn.Parameter( - torch.empty(config.text_config.hidden_size, dtype=torch.float16) - ) - +class LlavaBaseForCausalLM(nn.Module): def pad_input_ids( self, input_ids: List[int], @@ -434,14 +416,36 @@ def num_patches_per_side(self): return self.image_size // self.patch_size -class LlavaQwenForCausalLM(LlavaLlamaForCausalLM): +class LlavaLlamaForCausalLM(LlavaBaseForCausalLM): + def __init__( + self, + config: LlavaConfig, + quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, + ) -> None: + super().__init__() + + self.config = config + self.vision_tower = None + self.config.vision_config.hidden_size = config.mm_hidden_size + self.config.text_config.hidden_size = config.hidden_size + self.multi_modal_projector = LlavaMultiModalProjector(config) + self.language_model = LlamaForCausalLM(config, quant_config=quant_config) + if "unpad" in getattr(config, "mm_patch_merge_type", ""): + self.language_model.model.image_newline = nn.Parameter( + torch.empty(config.text_config.hidden_size, dtype=torch.float16) + ) + + +class LlavaQwenForCausalLM(LlavaBaseForCausalLM): def __init__( self, config: LlavaConfig, quant_config: Optional[QuantizationConfig] = None, cache_config: Optional[CacheConfig] = None, ) -> None: - super().__init__(config, quant_config=quant_config, cache_config=cache_config) + super().__init__() + self.config = config self.vision_tower = None if getattr(self.config, "vision_config", None) is None: @@ -467,14 +471,15 @@ def __init__( ) -class LlavaMistralForCausalLM(LlavaLlamaForCausalLM): +class LlavaMistralForCausalLM(LlavaBaseForCausalLM): def __init__( self, config: LlavaConfig, quant_config: Optional[QuantizationConfig] = None, cache_config: Optional[CacheConfig] = None, ) -> None: - super().__init__(config, quant_config=quant_config, cache_config=cache_config) + super().__init__() + self.config = config self.vision_tower = None if getattr(self.config, "vision_config", None) is None: diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 9c36216ede..5ba2a45e70 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -421,7 +421,7 @@ def _set_envs_and_config(server_args: ServerArgs): if not server_args.disable_flashinfer: assert_pkg_version( "flashinfer", - "0.1.5", + "0.1.6", "Please uninstall the old version and " "reinstall the latest version by following the instructions " "at https://docs.flashinfer.ai/installation.html.", From 492143bf32b25848300dcc18bd51fef6c25d02d7 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Thu, 29 Aug 2024 04:25:46 +1000 Subject: [PATCH 111/118] fix: resolve qwen2 moe weight loader (#1252) --- python/sglang/srt/models/qwen2_moe.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py index 9bdbd75066..d5c79a40f0 100644 --- a/python/sglang/srt/models/qwen2_moe.py +++ b/python/sglang/srt/models/qwen2_moe.py @@ -401,24 +401,12 @@ def load_weights(self, weights: 
Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] - expert_params_mapping = [ - # These are the weights for the experts - # (param_name, weight_name, expert_id, shard_id) - ( - ( - "experts.w13_weight" - if weight_name in ["gate_proj", "up_proj"] - else "experts.w2_weight" - ), - f"experts.{expert_id}.{weight_name}.weight", - expert_id, - shard_id, - ) - for expert_id in range(self.config.num_experts) - for shard_id, weight_name in enumerate( - ["gate_proj", "down_proj", "up_proj"] - ) - ] + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.num_experts, + ) params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: @@ -458,7 +446,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader( param, loaded_weight, - weight_name, + name, shard_id=shard_id, expert_id=expert_id, ) From 13ac95b8946ff0bc62527567931bdf647cc43c5e Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Thu, 29 Aug 2024 04:46:33 +1000 Subject: [PATCH 112/118] chore: bump v0.2.14.post2 (#1250) --- README.md | 2 +- python/pyproject.toml | 2 +- python/sglang/version.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9d795ce438..8e3e47c100 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ ### Method 2: From source ``` # Use the last release branch -git clone -b v0.2.14.post1 https://github.com/sgl-project/sglang.git +git clone -b v0.2.14.post2 https://github.com/sgl-project/sglang.git cd sglang pip install --upgrade pip diff --git a/python/pyproject.toml b/python/pyproject.toml index 7b2741fd21..87c99bffae 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "sglang" -version = "0.2.14.post1" +version = "0.2.14.post2" description = "SGLang is yet another fast serving framework for large language models and vision language models." readme = "README.md" requires-python = ">=3.8" diff --git a/python/sglang/version.py b/python/sglang/version.py index 839b265519..ad954de503 100644 --- a/python/sglang/version.py +++ b/python/sglang/version.py @@ -1 +1 @@ -__version__ = "0.2.14.post1" +__version__ = "0.2.14.post2" From 6c34d6339c040628e895d167cf22f2ab7104f8b3 Mon Sep 17 00:00:00 2001 From: Enrique Shockwave <33002121+qeternity@users.noreply.github.com> Date: Thu, 29 Aug 2024 02:57:10 +0100 Subject: [PATCH 113/118] make json_schema usable from gen (#1254) --- python/sglang/api.py | 2 ++ python/sglang/lang/interpreter.py | 1 + python/sglang/lang/ir.py | 5 +++++ 3 files changed, 8 insertions(+) diff --git a/python/sglang/api.py b/python/sglang/api.py index 3a2f747bec..9405606b71 100644 --- a/python/sglang/api.py +++ b/python/sglang/api.py @@ -78,6 +78,7 @@ def gen( choices: Optional[List[str]] = None, choices_method: Optional[ChoicesSamplingMethod] = None, regex: Optional[str] = None, + json_schema: Optional[str] = None, ): """Call the model to generate. 
See the meaning of the arguments in docs/en/sampling_params.md""" @@ -114,6 +115,7 @@ def gen( return_text_in_logprobs, dtype, regex, + json_schema, ) diff --git a/python/sglang/lang/interpreter.py b/python/sglang/lang/interpreter.py index 306d280c7f..91f48456aa 100644 --- a/python/sglang/lang/interpreter.py +++ b/python/sglang/lang/interpreter.py @@ -673,6 +673,7 @@ def _resolve_sampling_params(self, sampling_params): "return_text_in_logprobs", "dtype", "regex", + "json_schema", ]: value = getattr(sampling_params, item, None) if value is not None: diff --git a/python/sglang/lang/ir.py b/python/sglang/lang/ir.py index 199a7ac7a4..99a3e8e68b 100644 --- a/python/sglang/lang/ir.py +++ b/python/sglang/lang/ir.py @@ -30,6 +30,7 @@ class SglSamplingParams: logprob_start_len: Optional[int] = (None,) top_logprobs_num: Optional[int] = (None,) return_text_in_logprobs: Optional[bool] = (None,) + json_schema: Optional[str] = None # for constrained generation, not included in to_xxx_kwargs dtype: Optional[str] = None @@ -51,6 +52,7 @@ def clone(self): self.logprob_start_len, self.top_logprobs_num, self.return_text_in_logprobs, + self.json_schema, ) def to_openai_kwargs(self): @@ -121,6 +123,7 @@ def to_srt_kwargs(self): "presence_penalty": self.presence_penalty, "ignore_eos": self.ignore_eos, "regex": self.regex, + "json_schema": self.json_schema, } @@ -425,6 +428,7 @@ def __init__( return_text_in_logprobs: Optional[bool] = None, dtype: Optional[type] = None, regex: Optional[str] = None, + json_schema: Optional[str] = None, ): """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md""" super().__init__() @@ -446,6 +450,7 @@ def __init__( return_text_in_logprobs=return_text_in_logprobs, dtype=dtype, regex=regex, + json_schema=json_schema, ) def __repr__(self): From 8153168c96c76cdc77eabcbe03b167f9f3b4385f Mon Sep 17 00:00:00 2001 From: Zhiqiang Xie Date: Wed, 28 Aug 2024 18:57:54 -0700 Subject: [PATCH 114/118] fix data racing due to mutable reference using deepcopy (#1255) --- python/sglang/srt/managers/io_struct.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 3f80c64cf9..5b91ff62e9 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -18,8 +18,9 @@ processes (TokenizerManager, DetokenizerManager, Controller). 
""" +import copy import uuid -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Dict, List, Optional, Union from sglang.srt.managers.schedule_batch import BaseFinishReason @@ -249,6 +250,10 @@ class BatchTokenIDOut: meta_info: List[Dict] finished_reason: List[BaseFinishReason] + def __post_init__(self): + # deepcopy meta_info to avoid modification in place + self.meta_info = copy.deepcopy(self.meta_info) + @dataclass class BatchStrOut: From 381dd57bd69f027a3298d107d8eb851c3c29d8e4 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Wed, 28 Aug 2024 18:58:52 -0700 Subject: [PATCH 115/118] Sampler cudagraph (#1253) --- python/sglang/bench_latency.py | 14 ++-- python/sglang/srt/layers/logits_processor.py | 8 +- python/sglang/srt/layers/sampler.py | 83 +++++++++++++++---- python/sglang/srt/managers/schedule_batch.py | 28 +++++-- python/sglang/srt/managers/tp_worker.py | 52 +++++++----- .../srt/model_executor/cuda_graph_runner.py | 33 ++++++-- .../srt/model_executor/forward_batch_info.py | 7 ++ .../sglang/srt/model_executor/model_runner.py | 14 +++- python/sglang/srt/models/chatglm.py | 16 +--- python/sglang/srt/models/commandr.py | 6 +- python/sglang/srt/models/dbrx.py | 6 +- python/sglang/srt/models/deepseek.py | 6 +- python/sglang/srt/models/deepseek_v2.py | 6 +- python/sglang/srt/models/gemma.py | 6 +- python/sglang/srt/models/gemma2.py | 6 +- python/sglang/srt/models/gpt_bigcode.py | 6 +- python/sglang/srt/models/grok.py | 6 +- python/sglang/srt/models/internlm2.py | 6 +- python/sglang/srt/models/llama2.py | 10 ++- .../sglang/srt/models/llama_classification.py | 4 +- python/sglang/srt/models/minicpm.py | 6 +- python/sglang/srt/models/mixtral.py | 6 +- python/sglang/srt/models/mixtral_quant.py | 6 +- python/sglang/srt/models/qwen.py | 7 +- python/sglang/srt/models/qwen2.py | 8 +- python/sglang/srt/models/qwen2_moe.py | 19 ++--- python/sglang/srt/models/stablelm.py | 6 +- .../srt/sampling/sampling_batch_info.py | 75 ++++++++++++++++- python/sglang/test/runners.py | 2 +- 29 files changed, 342 insertions(+), 116 deletions(-) diff --git a/python/sglang/bench_latency.py b/python/sglang/bench_latency.py index dea910f577..3a48740857 100644 --- a/python/sglang/bench_latency.py +++ b/python/sglang/bench_latency.py @@ -200,16 +200,16 @@ def extend(reqs, model_runner): tree_cache=None, ) batch.prepare_for_extend(model_runner.model_config.vocab_size) - output = model_runner.forward(batch, ForwardMode.EXTEND) - next_token_ids = batch.sample(output.next_token_logits) - return next_token_ids, output.next_token_logits, batch + sample_output, logits_output = model_runner.forward(batch, ForwardMode.EXTEND) + next_token_ids = sample_output.batch_next_token_ids.tolist() + return next_token_ids, logits_output.next_token_logits, batch def decode(input_token_ids, batch, model_runner): - batch.prepare_for_decode(input_token_ids.cpu().numpy()) - output = model_runner.forward(batch, ForwardMode.DECODE) - next_token_ids = batch.sample(output.next_token_logits) - return next_token_ids, output.next_token_logits + batch.prepare_for_decode(input_token_ids) + sample_output, logits_output = model_runner.forward(batch, ForwardMode.DECODE) + next_token_ids = sample_output.batch_next_token_ids.tolist() + return next_token_ids, logits_output.next_token_logits @torch.inference_mode() diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py index 63f74d8b02..b81f3d2a04 100644 --- a/python/sglang/srt/layers/logits_processor.py +++ 
b/python/sglang/srt/layers/logits_processor.py @@ -29,7 +29,7 @@ @dataclasses.dataclass -class LogitProcessorOutput: +class LogitsProcessorOutput: # The logits of the next tokens. shape: [#seq, vocab_size] next_token_logits: torch.Tensor # The logprobs of the next tokens. shape: [#seq, vocab_size] @@ -185,7 +185,7 @@ def forward( # Return only last_logits if logprob is not requested if not logits_metadata.return_logprob: - return LogitProcessorOutput( + return LogitsProcessorOutput( next_token_logits=last_logits, next_token_logprobs=None, normalized_prompt_logprobs=None, @@ -209,7 +209,7 @@ def forward( else: output_top_logprobs = None - return LogitProcessorOutput( + return LogitsProcessorOutput( next_token_logits=last_logits, next_token_logprobs=last_logprobs, normalized_prompt_logprobs=None, @@ -278,7 +278,7 @@ def forward( # Remove the last token logprob for the prefill tokens. input_token_logprobs = input_token_logprobs[:-1] - return LogitProcessorOutput( + return LogitsProcessorOutput( next_token_logits=last_logits, next_token_logprobs=last_logprobs, normalized_prompt_logprobs=normalized_prompt_logprobs, diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py index 3006e765c8..6cb7d0a7c1 100644 --- a/python/sglang/srt/layers/sampler.py +++ b/python/sglang/srt/layers/sampler.py @@ -1,4 +1,6 @@ +import dataclasses import logging +from typing import Union import torch from flashinfer.sampling import ( @@ -9,6 +11,8 @@ ) from vllm.model_executor.custom_op import CustomOp +from sglang.srt.layers.logits_processor import LogitsProcessorOutput + # TODO: move this dict to another place from sglang.srt.managers.schedule_batch import global_server_args_dict from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo @@ -16,30 +20,71 @@ logger = logging.getLogger(__name__) +@dataclasses.dataclass +class SampleOutput: + success: torch.Tensor + probs: torch.Tensor + batch_next_token_ids: torch.Tensor + + class Sampler(CustomOp): def __init__(self): super().__init__() - def forward_cuda(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo): + def _apply_penalties(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo): + # min-token, presence, frequency + if sampling_info.linear_penalties is not None: + logits += sampling_info.linear_penalties + + # repetition + if sampling_info.scaling_penalties is not None: + logits = torch.where( + logits > 0, + logits / sampling_info.scaling_penalties, + logits * sampling_info.scaling_penalties, + ) + + return logits + + def _get_probs( + self, + logits: torch.Tensor, + sampling_info: SamplingBatchInfo, + is_torch_compile: bool = False, + ): # Post process logits logits = logits.contiguous() logits.div_(sampling_info.temperatures) + if is_torch_compile: + # FIXME: Temporary workaround for unknown bugs in torch.compile + logits.add_(0) + if sampling_info.logit_bias is not None: logits.add_(sampling_info.logit_bias) if sampling_info.vocab_mask is not None: logits = logits.masked_fill(~sampling_info.vocab_mask, float("-inf")) - logits = sampling_info.penalizer_orchestrator.apply(logits) + logits = self._apply_penalties(logits, sampling_info) - probs = torch.softmax(logits, dim=-1) + return torch.softmax(logits, dim=-1) + + def forward_cuda( + self, + logits: Union[torch.Tensor, LogitsProcessorOutput], + sampling_info: SamplingBatchInfo, + ): + if isinstance(logits, LogitsProcessorOutput): + logits = logits.next_token_logits + + probs = self._get_probs(logits, sampling_info) if not 
global_server_args_dict["disable_flashinfer_sampling"]: max_top_k_round, batch_size = 32, probs.shape[0] uniform_samples = torch.rand( (max_top_k_round, batch_size), device=probs.device ) - if sampling_info.min_ps.any(): + if sampling_info.need_min_p_sampling: probs = top_k_renorm_prob(probs, sampling_info.top_ks) probs = top_p_renorm_prob(probs, sampling_info.top_ps) batch_next_token_ids, success = min_p_sampling_from_probs( @@ -55,18 +100,23 @@ def forward_cuda(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo): probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps ) - if not torch.all(success): - logging.warning("Sampling failed, fallback to top_k=1 strategy") - probs = probs.masked_fill(torch.isnan(probs), 0.0) - argmax_ids = torch.argmax(probs, dim=-1) - batch_next_token_ids = torch.where( - success, batch_next_token_ids, argmax_ids - ) + return SampleOutput(success, probs, batch_next_token_ids) - return batch_next_token_ids + def forward_native( + self, + logits: Union[torch.Tensor, LogitsProcessorOutput], + sampling_info: SamplingBatchInfo, + ): + if isinstance(logits, LogitsProcessorOutput): + logits = logits.next_token_logits + + probs = self._get_probs(logits, sampling_info, is_torch_compile=True) + + batch_next_token_ids, success = top_k_top_p_min_p_sampling_from_probs_torch( + probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps + ) - def forward_native(): - raise NotImplementedError("Native forward is not implemented yet.") + return SampleOutput(success, probs, batch_next_token_ids) def top_k_top_p_min_p_sampling_from_probs_torch( @@ -87,7 +137,10 @@ def top_k_top_p_min_p_sampling_from_probs_torch( probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0 probs_sort.div_(probs_sort.max(dim=-1, keepdim=True)[0]) try: - sampled_index = torch.multinomial(probs_sort, num_samples=1) + # FIXME: torch.multiomial does not support num_samples = 1 + sampled_index = torch.multinomial(probs_sort, num_samples=2, replacement=True)[ + :, :1 + ] except RuntimeError as e: logger.warning(f"Sampling error: {e}") batch_next_token_ids = torch.zeros( diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 5554170a35..f5b9c9eb27 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -1,3 +1,5 @@ +from __future__ import annotations + """ Copyright 2023-2024 SGLang Team Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,7 +19,7 @@ import logging from dataclasses import dataclass -from typing import List, Optional, Union +from typing import TYPE_CHECKING, List, Optional, Union import torch @@ -29,6 +31,10 @@ from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo +if TYPE_CHECKING: + from sglang.srt.layers.sampler import SampleOutput + + INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5 # Put some global args for easy access @@ -678,11 +684,17 @@ def merge(self, other: "ScheduleBatch"): self.top_logprobs_nums.extend(other.top_logprobs_nums) self.return_logprob = any(req.return_logprob for req in self.reqs) - def sample(self, logits: torch.Tensor): - from sglang.srt.layers.sampler import Sampler - - sampler = Sampler() - - batch_next_token_ids = sampler(logits, self.sampling_info) + def check_sample_results(self, sample_output: SampleOutput): + if not torch.all(sample_output.success): + probs = sample_output.probs + batch_next_token_ids = 
sample_output.batch_next_token_ids + logging.warning("Sampling failed, fallback to top_k=1 strategy") + probs = probs.masked_fill(torch.isnan(probs), 0.0) + argmax_ids = torch.argmax(probs, dim=-1) + batch_next_token_ids = torch.where( + sample_output.success, batch_next_token_ids, argmax_ids + ) + sample_output.probs = probs + sample_output.batch_next_token_ids = batch_next_token_ids - return batch_next_token_ids + return sample_output.batch_next_token_ids diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index cd1b580643..123b1f5d5d 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -31,7 +31,7 @@ from sglang.srt.constrained.fsm_cache import FSMCache from sglang.srt.constrained.jump_forward import JumpForwardCache from sglang.srt.hf_transformers_utils import get_processor, get_tokenizer -from sglang.srt.layers.logits_processor import LogitProcessorOutput +from sglang.srt.layers.logits_processor import LogitsProcessorOutput from sglang.srt.managers.io_struct import ( AbortReq, BatchEmbeddingOut, @@ -504,21 +504,29 @@ def forward_prefill_batch(self, batch: ScheduleBatch): if self.model_runner.is_generation: # Forward and sample the next tokens if batch.extend_num_tokens != 0: - output = self.model_runner.forward(batch, ForwardMode.EXTEND) - next_token_ids = batch.sample(output.next_token_logits) + sample_output, logits_output = self.model_runner.forward( + batch, ForwardMode.EXTEND + ) + next_token_ids = batch.check_sample_results(sample_output) batch.sampling_info.penalizer_orchestrator.cumulate_output_tokens( next_token_ids ) # Move logprobs to cpu - if output.next_token_logprobs is not None: - output.next_token_logprobs = output.next_token_logprobs[ - torch.arange(len(next_token_ids), device=next_token_ids.device), - next_token_ids, - ].tolist() - output.input_token_logprobs = output.input_token_logprobs.tolist() - output.normalized_prompt_logprobs = ( - output.normalized_prompt_logprobs.tolist() + if logits_output.next_token_logprobs is not None: + logits_output.next_token_logprobs = ( + logits_output.next_token_logprobs[ + torch.arange( + len(next_token_ids), device=next_token_ids.device + ), + next_token_ids, + ].tolist() + ) + logits_output.input_token_logprobs = ( + logits_output.input_token_logprobs.tolist() + ) + logits_output.normalized_prompt_logprobs = ( + logits_output.normalized_prompt_logprobs.tolist() ) next_token_ids = next_token_ids.tolist() @@ -557,12 +565,14 @@ def forward_prefill_batch(self, batch: ScheduleBatch): self.req_to_token_pool.free(req.req_pool_idx) if req.return_logprob: - self.add_logprob_return_values(i, req, pt, next_token_ids, output) + self.add_logprob_return_values( + i, req, pt, next_token_ids, logits_output + ) pt += req.extend_input_len else: assert batch.extend_num_tokens != 0 - output = self.model_runner.forward(batch, ForwardMode.EXTEND) - embeddings = output.embeddings.tolist() + logits_output = self.model_runner.forward(batch, ForwardMode.EXTEND) + embeddings = logits_output.embeddings.tolist() # Check finish conditions for i, req in enumerate(batch.reqs): @@ -590,7 +600,7 @@ def add_logprob_return_values( req: Req, pt: int, next_token_ids: List[int], - output: LogitProcessorOutput, + output: LogitsProcessorOutput, ): if req.normalized_prompt_logprob is None: req.normalized_prompt_logprob = output.normalized_prompt_logprobs[i] @@ -672,15 +682,17 @@ def forward_decode_batch(self, batch: ScheduleBatch): batch.prepare_for_decode() # Forward and sample 
the next tokens - output = self.model_runner.forward(batch, ForwardMode.DECODE) - next_token_ids = batch.sample(output.next_token_logits) + sample_output, logits_output = self.model_runner.forward( + batch, ForwardMode.DECODE + ) + next_token_ids = batch.check_sample_results(sample_output) batch.sampling_info.penalizer_orchestrator.cumulate_output_tokens( next_token_ids ) # Move logprobs to cpu - if output.next_token_logprobs is not None: - next_token_logprobs = output.next_token_logprobs[ + if logits_output.next_token_logprobs is not None: + next_token_logprobs = logits_output.next_token_logprobs[ torch.arange(len(next_token_ids), device=next_token_ids.device), next_token_ids, ].tolist() @@ -706,7 +718,7 @@ def forward_decode_batch(self, batch: ScheduleBatch): (next_token_logprobs[i], next_token_id) ) if req.top_logprobs_num > 0: - req.output_top_logprobs.append(output.output_top_logprobs[i]) + req.output_top_logprobs.append(logits_output.output_top_logprobs[i]) self.handle_finished_requests(batch) diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index 796db26623..40c87af88c 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -26,16 +26,18 @@ from vllm.model_executor.custom_op import CustomOp from sglang.srt.layers.logits_processor import ( - LogitProcessorOutput, LogitsMetadata, LogitsProcessor, + LogitsProcessorOutput, ) +from sglang.srt.layers.sampler import SampleOutput from sglang.srt.managers.schedule_batch import ScheduleBatch from sglang.srt.model_executor.forward_batch_info import ( ForwardMode, InputMetadata, update_flashinfer_indices, ) +from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo from sglang.srt.utils import monkey_patch_vllm_all_gather @@ -144,6 +146,10 @@ def __init__( self.flashinfer_kv_indices.clone(), ] + # Sampling inputs + vocab_size = model_runner.model_config.vocab_size + self.sampling_info = SamplingBatchInfo.dummy_one(self.max_bs, vocab_size) + self.compile_bs = [1, 2, 4, 8, 16, 24, 32] if use_torch_compile else [] if use_torch_compile: @@ -235,6 +241,7 @@ def capture_one_batch_size(self, bs: int, forward: Callable): def run_once(): input_metadata = InputMetadata( forward_mode=ForwardMode.DECODE, + sampling_info=self.sampling_info[:bs], batch_size=bs, req_pool_indices=req_pool_indices, seq_lens=seq_lens, @@ -299,27 +306,35 @@ def replay(self, batch: ScheduleBatch): self.flashinfer_handlers[bs], ) + # Sampling inputs + self.sampling_info.inplace_assign(raw_bs, batch.sampling_info) + # Replay torch.cuda.synchronize() self.graphs[bs].replay() torch.cuda.synchronize() - output = self.output_buffers[bs] + sample_output, logits_output = self.output_buffers[bs] # Unpad if bs != raw_bs: - output = LogitProcessorOutput( - next_token_logits=output.next_token_logits[:raw_bs], + logits_output = LogitsProcessorOutput( + next_token_logits=logits_output.next_token_logits[:raw_bs], next_token_logprobs=None, normalized_prompt_logprobs=None, input_token_logprobs=None, input_top_logprobs=None, output_top_logprobs=None, ) + sample_output = SampleOutput( + sample_output.success[:raw_bs], + sample_output.probs[:raw_bs], + sample_output.batch_next_token_ids[:raw_bs], + ) # Extract logprobs if batch.return_logprob: - output.next_token_logprobs = torch.nn.functional.log_softmax( - output.next_token_logits, dim=-1 + logits_output.next_token_logprobs = torch.nn.functional.log_softmax( + 
logits_output.next_token_logits, dim=-1 ) return_top_logprob = any(x > 0 for x in batch.top_logprobs_nums) if return_top_logprob: @@ -327,8 +342,8 @@ def replay(self, batch: ScheduleBatch): forward_mode=ForwardMode.DECODE, top_logprobs_nums=batch.top_logprobs_nums, ) - output.output_top_logprobs = LogitsProcessor.get_top_logprobs( - output.next_token_logprobs, logits_metadata + logits_output.output_top_logprobs = LogitsProcessor.get_top_logprobs( + logits_output.next_token_logprobs, logits_metadata )[1] - return output + return sample_output, logits_output diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py index f24cdf6b72..3d40c9d755 100644 --- a/python/sglang/srt/model_executor/forward_batch_info.py +++ b/python/sglang/srt/model_executor/forward_batch_info.py @@ -1,3 +1,5 @@ +from __future__ import annotations + """ Copyright 2023-2024 SGLang Team Licensed under the Apache License, Version 2.0 (the "License"); @@ -26,6 +28,7 @@ if TYPE_CHECKING: from sglang.srt.model_executor.model_runner import ModelRunner + from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo class ForwardMode(IntEnum): @@ -42,6 +45,7 @@ class InputMetadata: """Store all inforamtion of a forward pass.""" forward_mode: ForwardMode + sampling_info: SamplingBatchInfo batch_size: int req_pool_indices: torch.Tensor seq_lens: torch.Tensor @@ -169,6 +173,7 @@ def from_schedule_batch( ): ret = cls( forward_mode=forward_mode, + sampling_info=batch.sampling_info, batch_size=batch.batch_size(), req_pool_indices=batch.req_pool_indices, seq_lens=batch.seq_lens, @@ -179,6 +184,8 @@ def from_schedule_batch( top_logprobs_nums=batch.top_logprobs_nums, ) + ret.sampling_info.prepare_penalties() + ret.compute_positions(batch) ret.compute_extend_infos(batch) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 8ef47a530f..e6f5e74311 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -21,7 +21,7 @@ import logging import pkgutil from functools import lru_cache -from typing import Optional, Type +from typing import Optional, Tuple, Type import torch import torch.nn as nn @@ -44,6 +44,8 @@ from vllm.model_executor.models import ModelRegistry from sglang.global_config import global_config +from sglang.srt.layers.logits_processor import LogitsProcessorOutput +from sglang.srt.layers.sampler import SampleOutput from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict from sglang.srt.mem_cache.memory_pool import ( MHATokenToKVPool, @@ -524,7 +526,11 @@ def init_cuda_graphs(self): @torch.inference_mode() def forward_decode(self, batch: ScheduleBatch): - if self.cuda_graph_runner and self.cuda_graph_runner.can_run(len(batch.reqs)): + if ( + self.cuda_graph_runner + and self.cuda_graph_runner.can_run(len(batch.reqs)) + and not batch.sampling_info.has_bias() + ): return self.cuda_graph_runner.replay(batch) input_metadata = InputMetadata.from_schedule_batch( @@ -573,7 +579,9 @@ def forward_extend_multi_modal(self, batch: ScheduleBatch): input_metadata.image_offsets, ) - def forward(self, batch: ScheduleBatch, forward_mode: ForwardMode): + def forward( + self, batch: ScheduleBatch, forward_mode: ForwardMode + ) -> Tuple[SampleOutput, LogitsProcessorOutput]: if self.is_multimodal_model and forward_mode == ForwardMode.EXTEND: return self.forward_extend_multi_modal(batch) elif forward_mode == 
ForwardMode.DECODE: diff --git a/python/sglang/srt/models/chatglm.py b/python/sglang/srt/models/chatglm.py index b38b62fafd..9eb04dc263 100644 --- a/python/sglang/srt/models/chatglm.py +++ b/python/sglang/srt/models/chatglm.py @@ -31,20 +31,18 @@ ) from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import ChatGLMConfig from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata LoraConfig = None @@ -383,17 +381,11 @@ def forward( input_metadata: InputMetadata, ) -> torch.Tensor: hidden_states = self.transformer(input_ids, positions, input_metadata) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters(remove_duplicate=False)) diff --git a/python/sglang/srt/models/commandr.py b/python/sglang/srt/models/commandr.py index f6d6f6e1f9..c360106f97 100644 --- a/python/sglang/srt/models/commandr.py +++ b/python/sglang/srt/models/commandr.py @@ -64,6 +64,7 @@ from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -326,6 +327,7 @@ def __init__( self.config = config self.quant_config = quant_config self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() self.model = CohereModel(config, quant_config) @torch.no_grad() @@ -340,9 +342,11 @@ def forward( positions, input_metadata, ) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.model.embed_tokens.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/dbrx.py b/python/sglang/srt/models/dbrx.py index 39ac4aefa7..b3a76b56ae 100644 --- a/python/sglang/srt/models/dbrx.py +++ b/python/sglang/srt/models/dbrx.py @@ -45,6 +45,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -382,6 +383,7 @@ 
def __init__( padding_size=DEFAULT_VOCAB_PADDING_SIZE, ) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -391,9 +393,11 @@ def forward( input_metadata: InputMetadata, ) -> torch.Tensor: hidden_states = self.transformer(input_ids, positions, input_metadata) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): expert_params_mapping = [ diff --git a/python/sglang/srt/models/deepseek.py b/python/sglang/srt/models/deepseek.py index 59fd1ec7ed..b939602c1b 100644 --- a/python/sglang/srt/models/deepseek.py +++ b/python/sglang/srt/models/deepseek.py @@ -46,6 +46,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -385,6 +386,7 @@ def __init__( config.vocab_size, config.hidden_size, quant_config=quant_config ) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -394,9 +396,11 @@ def forward( input_metadata: InputMetadata, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index 13dd477392..15ecf4bb66 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -45,6 +45,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.managers.schedule_batch import global_server_args_dict from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -632,6 +633,7 @@ def __init__( config.vocab_size, config.hidden_size, quant_config=quant_config ) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() def forward( self, @@ -640,9 +642,11 @@ def forward( input_metadata: InputMetadata, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/gemma.py b/python/sglang/srt/models/gemma.py index ae3b1b1948..5a6e5df37f 100644 --- a/python/sglang/srt/models/gemma.py +++ b/python/sglang/srt/models/gemma.py @@ -37,6 +37,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import 
RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -287,6 +288,7 @@ def __init__( self.quant_config = quant_config self.model = GemmaModel(config, quant_config=quant_config) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -297,9 +299,11 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.model.embed_tokens.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return (sample_output, logits_output) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/gemma2.py b/python/sglang/srt/models/gemma2.py index 3223424d79..77ebd8564c 100644 --- a/python/sglang/srt/models/gemma2.py +++ b/python/sglang/srt/models/gemma2.py @@ -37,6 +37,7 @@ from sglang.srt.layers.layernorm import GemmaRMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -346,6 +347,7 @@ def __init__( self.quant_config = quant_config self.model = Gemma2Model(config, cache_config, quant_config) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -356,9 +358,11 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.model.embed_tokens.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def get_attention_sliding_window_size(self): return get_attention_sliding_window_size(self.config) diff --git a/python/sglang/srt/models/gpt_bigcode.py b/python/sglang/srt/models/gpt_bigcode.py index 94b7f6153c..dc828f0142 100644 --- a/python/sglang/srt/models/gpt_bigcode.py +++ b/python/sglang/srt/models/gpt_bigcode.py @@ -35,6 +35,7 @@ from sglang.srt.layers.activation import get_act_fn from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -261,6 +262,7 @@ def __init__( if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -270,9 +272,11 @@ def forward( input_metadata: InputMetadata, ) -> torch.Tensor: hidden_states = self.transformer(input_ids, positions, input_metadata) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters(remove_duplicate=False)) diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py index 
daf6f25da1..3c2a2c65ea 100644 --- a/python/sglang/srt/models/grok.py +++ b/python/sglang/srt/models/grok.py @@ -46,6 +46,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -297,6 +298,7 @@ def __init__( self.model = Grok1Model(config, quant_config=quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() # Monkey patch _prepare_weights to load pre-sharded weights setattr(DefaultModelLoader, "_prepare_weights", _prepare_presharded_weights) @@ -313,9 +315,11 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/internlm2.py b/python/sglang/srt/models/internlm2.py index f2947e991b..c0e4d19e12 100644 --- a/python/sglang/srt/models/internlm2.py +++ b/python/sglang/srt/models/internlm2.py @@ -40,6 +40,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -262,6 +263,7 @@ def __init__( self.model = InternLM2Model(config, quant_config) self.output = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -272,9 +274,11 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.output.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/llama2.py b/python/sglang/srt/models/llama2.py index fe75916a43..22751d9b67 100644 --- a/python/sglang/srt/models/llama2.py +++ b/python/sglang/srt/models/llama2.py @@ -39,8 +39,9 @@ from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm -from sglang.srt.layers.logits_processor import LogitProcessorOutput, LogitsProcessor +from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -302,6 +303,7 @@ def __init__( self.model = LlamaModel(config, quant_config=quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -310,11 
+312,13 @@ def forward( positions: torch.Tensor, input_metadata: InputMetadata, input_embeds: torch.Tensor = None, - ) -> LogitProcessorOutput: + ) -> LogitsProcessorOutput: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def get_module_name(self, name): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/llama_classification.py b/python/sglang/srt/models/llama_classification.py index c5effbfc9c..03ab5e802c 100644 --- a/python/sglang/srt/models/llama_classification.py +++ b/python/sglang/srt/models/llama_classification.py @@ -24,7 +24,7 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from sglang.srt.layers.logits_processor import LogitProcessorOutput +from sglang.srt.layers.logits_processor import LogitsProcessorOutput from sglang.srt.model_executor.forward_batch_info import InputMetadata from sglang.srt.models.llama2 import LlamaModel @@ -65,7 +65,7 @@ def forward( (input_metadata.batch_size, self.config.classification_out_size) ).to(input_ids.device) - return LogitProcessorOutput( + return LogitsProcessorOutput( next_token_logits=scores, next_token_logprobs=scores, normalized_prompt_logprobs=scores, diff --git a/python/sglang/srt/models/minicpm.py b/python/sglang/srt/models/minicpm.py index 49ff1926f3..0028ae67a8 100644 --- a/python/sglang/srt/models/minicpm.py +++ b/python/sglang/srt/models/minicpm.py @@ -39,6 +39,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -297,6 +298,7 @@ def __init__( self.scale_width = self.config.hidden_size / self.config.dim_model_base self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -314,9 +316,11 @@ def forward( lm_head_weight = self.model.embed_tokens.weight else: lm_head_weight = self.lm_head.weight - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, lm_head_weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py index d11f6c9519..ca38cb03ba 100644 --- a/python/sglang/srt/models/mixtral.py +++ b/python/sglang/srt/models/mixtral.py @@ -41,6 +41,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -299,6 +300,7 @@ def __init__( self.model = MixtralModel(config, quant_config=quant_config, prefix="model") self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() def forward( self, @@ -308,9 +310,11 @@ def forward( 
input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/mixtral_quant.py b/python/sglang/srt/models/mixtral_quant.py index b02e925c5a..97ac09ee62 100644 --- a/python/sglang/srt/models/mixtral_quant.py +++ b/python/sglang/srt/models/mixtral_quant.py @@ -45,6 +45,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -333,6 +334,7 @@ def __init__( self.model = MixtralModel(config, quant_config=quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -343,9 +345,11 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/qwen.py b/python/sglang/srt/models/qwen.py index 93dae9585c..4958a81298 100644 --- a/python/sglang/srt/models/qwen.py +++ b/python/sglang/srt/models/qwen.py @@ -39,6 +39,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -251,6 +252,7 @@ def __init__( vocab_size = ((config.vocab_size + 63) // 64) * 64 self.lm_head = ParallelLMHead(vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -260,10 +262,11 @@ def forward( input_metadata: InputMetadata, ): hidden_states = self.transformer(input_ids, positions, input_metadata) - next_tokens = self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - return next_tokens + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py index a0c54f6910..6bb5c0b906 100644 --- a/python/sglang/srt/models/qwen2.py +++ b/python/sglang/srt/models/qwen2.py @@ -38,8 +38,9 @@ from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor -from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType +from 
sglang.srt.layers.pooler import Pooler, PoolingType from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata Qwen2Config = None @@ -276,6 +277,7 @@ def __init__( self.model = Qwen2Model(config, quant_config=quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True) @torch.no_grad() @@ -289,9 +291,11 @@ def forward( ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) if not get_embedding: - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output else: return self.pooler(hidden_states, input_metadata) diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py index d5c79a40f0..67b5a6ce66 100644 --- a/python/sglang/srt/models/qwen2_moe.py +++ b/python/sglang/srt/models/qwen2_moe.py @@ -35,10 +35,8 @@ ReplicatedLinear, RowParallelLinear, ) -from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, @@ -49,6 +47,7 @@ from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -366,6 +365,7 @@ def __init__( config.vocab_size, config.hidden_size, quant_config=quant_config ) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -376,20 +376,11 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) - - def compute_logits( - self, - input_ids: torch.Tensor, - hidden_states: torch.Tensor, - input_metadata: InputMetadata, - ) -> torch.Tensor: - logits = self.logits_processor( - input_ids, hidden_states, self.lm_head.weight, input_metadata - ) - return logits + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/stablelm.py b/python/sglang/srt/models/stablelm.py index 9e10f12f2a..a3102baabd 100644 --- a/python/sglang/srt/models/stablelm.py +++ b/python/sglang/srt/models/stablelm.py @@ -40,6 +40,7 @@ from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler from sglang.srt.model_executor.forward_batch_info import InputMetadata @@ -249,6 +250,7 @@ def __init__( self.model = 
StableLMEpochModel(config, quant_config=quant_config) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() @torch.no_grad() def forward( @@ -259,9 +261,11 @@ def forward( input_embeds: torch.Tensor = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, input_metadata, input_embeds) - return self.logits_processor( + logits_output = self.logits_processor( input_ids, hidden_states, self.lm_head.weight, input_metadata ) + sample_output = self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/sampling/sampling_batch_info.py b/python/sglang/srt/sampling/sampling_batch_info.py index bc70a9018e..7843f4bd32 100644 --- a/python/sglang/srt/sampling/sampling_batch_info.py +++ b/python/sglang/srt/sampling/sampling_batch_info.py @@ -21,10 +21,63 @@ class SamplingBatchInfo: top_ps: torch.Tensor = None top_ks: torch.Tensor = None min_ps: torch.Tensor = None - penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None + + # Dispatch in CUDA graph + need_min_p_sampling: bool = False + + # Bias Tensors logit_bias: torch.Tensor = None vocab_mask: torch.Tensor = None + # Penalizer + penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None + linear_penalties: torch.Tensor = None + scaling_penalties: torch.Tensor = None + + def has_bias(self): + return ( + self.logit_bias is not None + or self.vocab_mask is not None + or self.linear_penalties is not None + or self.scaling_penalties is not None + ) + + @classmethod + def dummy_one(cls, max_bs: int, vocab_size: int): + ret = cls(vocab_size=vocab_size) + ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda") + ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda") + ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda") + ret.min_ps = torch.zeros((max_bs,), dtype=torch.float, device="cuda") + return ret + + def __getitem__(self, key): + if isinstance(key, slice): + # NOTE: We do not use cuda graph when there is bias tensors + assert not self.has_bias() + return SamplingBatchInfo( + vocab_size=self.vocab_size, + temperatures=self.temperatures[key], + top_ps=self.top_ps[key], + top_ks=self.top_ks[key], + min_ps=self.min_ps[key], + need_min_p_sampling=self.need_min_p_sampling, + ) + else: + raise NotImplementedError + + def inplace_assign(self, bs: int, other: SamplingBatchInfo): + # NOTE: We do not use cuda graph when there is bias tensors + assert not self.has_bias() + + self.vocab_size = other.vocab_size + self.need_min_p_sampling = other.need_min_p_sampling + + self.temperatures[:bs] = other.temperatures + self.top_ps[:bs] = other.top_ps + self.top_ks[:bs] = other.top_ks + self.min_ps[:bs] = other.min_ps + @classmethod def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int): device = "cuda" @@ -45,6 +98,7 @@ def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int): ret.min_ps = torch.tensor( [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device ) + ret.need_min_p_sampling = any(r.sampling_params.min_p > 0 for r in reqs) # Each penalizers will do nothing if they evaluate themselves as not required by looking at # the sampling_params of the requests (See {_is_required()} of each penalizers). 
So this @@ -72,6 +126,25 @@ def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int): return ret + def prepare_penalties(self): + self.scaling_penalties = None + self.linear_penalties = None + + for penalizer in self.penalizer_orchestrator.penalizers.values(): + if isinstance(penalizer, penaltylib.BatchedRepetitionPenalizer): + if penalizer.is_prepared(): + self.scaling_penalties = penalizer.cumulated_repetition_penalties + else: + if penalizer.is_prepared(): + if self.linear_penalties is None: + bs = self.penalizer_orchestrator.batch.batch_size() + self.linear_penalties = torch.zeros( + (bs, self.vocab_size), + dtype=torch.float32, + device="cuda", + ) + self.linear_penalties = penalizer.apply(self.linear_penalties) + def update_regex_vocab_mask(self, batch: ScheduleBatch): bs, reqs = batch.batch_size(), batch.reqs device = "cuda" diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index e69d699a7d..ac69ab875b 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -180,7 +180,7 @@ def __init__( tp_size=tp_size, dtype=get_dtype_str(torch_dtype), port=port, - mem_fraction_static=0.7, + mem_fraction_static=0.69, trust_remote_code=False, is_embedding=not self.is_generation, ) From a362340b33258eae0f48504be09659e2e9dcd035 Mon Sep 17 00:00:00 2001 From: lxww302 <68112258+lxww302@users.noreply.github.com> Date: Thu, 29 Aug 2024 23:43:41 -0700 Subject: [PATCH 116/118] fix: multimodal_config in monkey_patch_vllm_dummy_weight_loader (#1260) --- python/sglang/srt/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index b7bb657306..66a5679d75 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -407,7 +407,6 @@ def monkey_patch_vllm_dummy_weight_loader(): DummyModelLoader, LoRAConfig, ModelConfig, - MultiModalConfig, ParallelConfig, SchedulerConfig, _initialize_model, @@ -422,7 +421,6 @@ def load_model( model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig, @@ -433,7 +431,6 @@ def load_model( model_config, self.load_config, lora_config, - multimodal_config, cache_config, ) From f414352ae6783dc20dc93e09be00ea62f4438931 Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Fri, 30 Aug 2024 14:45:40 +0800 Subject: [PATCH 117/118] Transpose mla weight offline (#1261) Co-authored-by: Yineng Zhang --- python/sglang/srt/models/deepseek_v2.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index 15ecf4bb66..67d99d5124 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -417,12 +417,8 @@ def __init__( v_head_dim=self.kv_lora_rank, ) - kv_b_proj = self.kv_b_proj - w_kc, w_vc = kv_b_proj.weight.unflatten( - 0, (-1, qk_nope_head_dim + v_head_dim) - ).split([qk_nope_head_dim, v_head_dim], dim=1) - self.w_kc = w_kc - self.w_vc = w_vc + self.w_kc = None + self.w_vc = None def forward( self, @@ -464,7 +460,7 @@ def forward( ) torch.bmm( attn_output.transpose(0, 1), - self.w_vc.transpose(1, 2).contiguous(), + self.w_vc, out=attn_bmm_output.transpose(0, 1), ) @@ -715,5 +711,15 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ) weight_loader(param, loaded_weight) + if global_server_args_dict["enable_mla"]: + for 
layer_id in range(self.config.num_hidden_layers): + self_attn = self.model.layers[layer_id].self_attn + w_kc, w_vc = self_attn.kv_b_proj.weight.unflatten( + 0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim) + ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1) + self_attn.w_kc = w_kc.contiguous() + self_attn.w_vc = w_vc.transpose(1, 2).contiguous() + del self_attn.kv_b_proj + EntryClass = DeepseekV2ForCausalLM From b7f834101476209767b7c8a52f17aa86cad79f44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=A2=85=EA=B3=A4?= <149566442+Deepfocused@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:08:28 +0900 Subject: [PATCH 118/118] EXAONE 3.0 Model Support (#1258) Co-authored-by: Yineng Zhang --- python/sglang/srt/configs/__init__.py | 5 + python/sglang/srt/configs/exaone.py | 195 ++++++++++ python/sglang/srt/hf_transformers_utils.py | 12 +- python/sglang/srt/models/exaone.py | 399 +++++++++++++++++++++ 4 files changed, 609 insertions(+), 2 deletions(-) create mode 100644 python/sglang/srt/configs/__init__.py create mode 100644 python/sglang/srt/configs/exaone.py create mode 100644 python/sglang/srt/models/exaone.py diff --git a/python/sglang/srt/configs/__init__.py b/python/sglang/srt/configs/__init__.py new file mode 100644 index 0000000000..9e74366709 --- /dev/null +++ b/python/sglang/srt/configs/__init__.py @@ -0,0 +1,5 @@ +from sglang.srt.configs.exaone import ExaoneConfig + +__all__ = [ + "ExaoneConfig", +] diff --git a/python/sglang/srt/configs/exaone.py b/python/sglang/srt/configs/exaone.py new file mode 100644 index 0000000000..7b0a2d290d --- /dev/null +++ b/python/sglang/srt/configs/exaone.py @@ -0,0 +1,195 @@ +# coding=utf-8 +# Copyright 2024 The LG AI Research EXAONE Lab. All rights reserved. +# Copyright 2024 The LG CNS AI Engineering Team. +# Copyright 2023-2024 SGLang Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" EXAONE model configuration """ +from typing import Any, Dict + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, Any] = {} + + +# ruff: noqa: E501 +class ExaoneConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.ExaoneModel`. It is used to + instantiate a EXAONE model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Exaone + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 102400): + Vocabulary size of the EXAONE model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.ExaoneModel`. 
Vocabulary size of the model. + Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of + :class:`~transformers.EXAONEModel`. + max_position_embeddings (:obj:`int`, `optional`, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + hidden_size (:obj:`int`, `optional`, defaults to 2048): + Dimensionality of the encoder layers and the pooler layer. + num_layers (:obj:`int`, `optional`, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, `optional`, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (:obj:`int`, `optional`): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + intermediate_size (:obj:`int`, `optional`, defaults to `hidden_size * 4`): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"silu"`): + The non-linear activation function (function or string) in the decoder. + rope_theta (:obj:`float`, `optional`, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (:obj:`Dict`, `optional`): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (:obj:`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (:obj:`float`, `optional`): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (:obj:`int`, `optional`): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (:obj:`float`, `optional`): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (:obj:`float`, `optional`): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (:obj:`float`, `optional`): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (:obj:`List[float]`, `optional`): + Only used with 'longrope'. 
The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (:obj:`List[float]`, `optional`): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (:obj:`float`, `optional`): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (:obj:`float`, `optional`): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + embed_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for the attention probabilities. + layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5): + The epsilon used by the layer normalization layers. + initializer_range (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if ``configs.is_decoder=True``. + bos_token_id (:obj:`int`, `optional`, defaults to 0): + Beginning of stream token id. + eos_token_id (:obj:`int`, `optional`, defaults to 2): + End of stream token id. + tie_word_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to tie weight embeddings + gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`): + If True, use gradient checkpointing to save memory at the expense of slower backward pass. 
+ + Example:: + + >>> from transformers import EXAONEModel, ExaoneConfig + + >>> # Initializing a EXAONE configuration + >>> configuration = ExaoneConfig() + + >>> # Initializing a model from configuration + >>> model = EXAONEModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.configs + """ + + model_type = "exaone" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_hidden_layers": "num_layers"} + + def __init__( + self, + vocab_size=102400, + max_position_embeddings=2048, + hidden_size=2048, + num_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + intermediate_size=None, + activation_function="silu", + rope_theta=10000.0, + rope_scaling=None, + embed_dropout=0.0, + attention_dropout=0.0, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + use_cache=True, + bos_token_id=0, + eos_token_id=2, + tie_word_embeddings=True, + **kwargs + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.num_layers = num_layers + self.num_attention_heads = num_attention_heads + self.num_hidden_layers = num_layers + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + if intermediate_size: + self.intermediate_size = intermediate_size + else: + self.intermediate_size = hidden_size * 4 + self.activation_function = activation_function + self.embed_dropout = embed_dropout + self.attention_dropout = attention_dropout + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs + ) diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py index 2be4169140..7fce3b2401 100644 --- a/python/sglang/srt/hf_transformers_utils.py +++ b/python/sglang/srt/hf_transformers_utils.py @@ -15,6 +15,7 @@ """Utilities for Huggingface Transformers.""" +import contextlib import functools import json import os @@ -34,14 +35,21 @@ try: from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig + from sglang.srt.configs import ExaoneConfig + _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { ChatGLMConfig.model_type: ChatGLMConfig, DbrxConfig.model_type: DbrxConfig, + ExaoneConfig.model_type: ExaoneConfig, } except ImportError: # We want this file to run without vllm dependency _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {} +for name, cls in _CONFIG_REGISTRY.items(): + with contextlib.suppress(ValueError): + AutoConfig.register(name, cls) + from sglang.srt.utils import is_multimodal_model @@ -53,7 +61,7 @@ def download_from_hf(model_path: str): def get_config_json(model_path: str): - with open(os.path.join(model_path, "config.json")) as f: + with open(os.path.join(model_path, "configs.json")) as f: config = json.load(f) return config @@ -89,7 +97,7 @@ def get_config( def get_context_length(config): - """Get the context length of a model from a huggingface model config.""" + """Get the context length of a model from a huggingface model configs.""" rope_scaling = getattr(config, "rope_scaling", None) if rope_scaling: rope_scaling_factor = config.rope_scaling["factor"] diff --git a/python/sglang/srt/models/exaone.py 
b/python/sglang/srt/models/exaone.py new file mode 100644 index 0000000000..4dcafed7ce --- /dev/null +++ b/python/sglang/srt/models/exaone.py @@ -0,0 +1,399 @@ +""" +Copyright 2024 The LGcns AI Engineering Team +Copyright 2023-2024 SGLang Team +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +# Adapted from llama2.py +"""Inference-only Exaone model compatible with HuggingFace weights.""" + +from typing import Any, Dict, Iterable, Optional, Tuple + +import torch +from torch import nn +from vllm.config import CacheConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.linear import ( + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader + +from sglang.srt.layers.activation import SiluAndMul +from sglang.srt.layers.layernorm import RMSNorm +from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput +from sglang.srt.layers.radix_attention import RadixAttention +from sglang.srt.layers.sampler import Sampler +from sglang.srt.model_executor.forward_batch_info import InputMetadata + + +class ExaoneGatedMLP(nn.Module): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, + [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", + ) + self.c_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.c_proj", + ) + if hidden_act != "silu": + raise ValueError( + f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now." 
+ ) + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.c_proj(x) + return x + + +class ExaoneAttention(nn.Module): + def __init__( + self, + config, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + layer_id: int = 0, + rope_theta: float = 500000, + rope_scaling: Optional[Dict[str, Any]] = None, + rope_is_neox_style: bool = True, + max_position_embeddings: int = 4096, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + # MistralConfig has an optional head_dim introduced by Mistral-Nemo + self.head_dim = getattr( + config, "head_dim", self.hidden_size // self.total_num_heads + ) + self.rotary_dim = int( + self.head_dim * getattr(config, "partial_rotary_factor", 1) + ) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.out_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.out_proj", + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.rotary_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + is_neox_style=rope_is_neox_style, + ) + self.attn = RadixAttention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + layer_id=layer_id, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v, input_metadata) + output, _ = self.out_proj(attn_output) + return output + + +class ExaoneDecoderLayer(nn.Module): + def __init__( + self, + config, + layer_id: int = 0, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 500000) + rope_scaling = getattr(config, "rope_scaling", None) + if rope_scaling is not None and getattr( + config, "original_max_position_embeddings", None + ): + rope_scaling["original_max_position_embeddings"] = ( + config.original_max_position_embeddings + ) + rope_is_neox_style = getattr(config, "rope_is_neox_style", True) + max_position_embeddings = getattr(config, 
"max_position_embeddings", 4096) + self.self_attn = ExaoneAttention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + layer_id=layer_id, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + rope_is_neox_style=rope_is_neox_style, + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + self.mlp = ExaoneGatedMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.activation_function, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + rms_norm_eps = config.layer_norm_epsilon + self.ln_1 = RMSNorm(config.hidden_size, eps=rms_norm_eps) + self.ln_2 = RMSNorm(config.hidden_size, eps=rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + else: + hidden_states, residual = self.ln_1(hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.ln_2(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class ExaoneModel(nn.Module): + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.wte = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.h = nn.ModuleList( + [ + ExaoneDecoderLayer( + config, i, quant_config=quant_config, prefix=f"model.h.{i}" + ) + for i in range(config.num_hidden_layers) + ] + ) + rms_norm_eps = config.layer_norm_epsilon + self.ln_f = RMSNorm(config.hidden_size, eps=rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + input_metadata: InputMetadata, + input_embeds: torch.Tensor = None, + ) -> torch.Tensor: + if input_embeds is None: + hidden_states = self.wte(input_ids) + else: + hidden_states = input_embeds + residual = None + for i in range(len(self.h)): + layer = self.h[i] + hidden_states, residual = layer( + positions, + hidden_states, + input_metadata, + residual, + ) + hidden_states, _ = self.ln_f(hidden_states, residual) + return hidden_states + + +class ExaoneForCausalLM(nn.Module): + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, + efficient_weight_load=False, + ) -> None: + super().__init__() + self.config = config + self.quant_config = quant_config + self.transformer = ExaoneModel(config, quant_config=quant_config) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.logits_processor = LogitsProcessor(config) + self.sampler = Sampler() + + @torch.no_grad() + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + input_metadata: InputMetadata, + input_embeds: torch.Tensor = None, + ) -> LogitsProcessorOutput: + hidden_states = self.transformer( + input_ids, positions, input_metadata, input_embeds + ) + logits_output = self.logits_processor( + input_ids, hidden_states, self.lm_head.weight, input_metadata + ) + sample_output = 
self.sampler(logits_output, input_metadata.sampling_info) + return sample_output, logits_output + + def get_module_name(self, name): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id, num_shard) + ("qkv_proj", "q_proj", "q", 3), + ("qkv_proj", "k_proj", "k", 3), + ("qkv_proj", "v_proj", "v", 3), + ("gate_up_proj", "c_fc_0", 0, 2), + ("gate_up_proj", "c_fc_1", 1, 2), + ] + for param_name, weight_name, shard_id, num_shard in stacked_params_mapping: + if weight_name in name: + return ( + name.replace(weight_name, param_name)[: -len(".weight")], + num_shard, + ) + return name[: -len(".weight")], 1 + + def get_num_params(self): + params_dict = dict(self.named_parameters()) + return len(params_dict) + + def load_weights( + self, weights: Iterable[Tuple[str, torch.Tensor]], name=None, loaded_weight=None + ): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "c_fc_0", 0), + ("gate_up_proj", "c_fc_1", 1), + ] + params_dict = dict(self.named_parameters()) + + def load_weights_per_param(name, loaded_weight): + if "rotary_emb.inv_freq" in name or "projector" in name: + return + if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + return + if name.startswith("model.vision_tower") and name not in params_dict: + return + + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + return + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + + if name is None or loaded_weight is None: + for name, loaded_weight in weights: + name = name.replace("attn.attention", "self_attn") + load_weights_per_param(name, loaded_weight) + else: + name = name.replace("attn.attention", "self_attn") + load_weights_per_param(name, loaded_weight) + + +EntryClass = ExaoneForCausalLM

zBvmSM_&!~MS%Emgb?21tN~e_0L(dk~!hjst6T(~}yCHxOErd)OopLr)zfX#$80*m2 z$ld|uK8|~Z5dC-87Co$AGF^){f9P12-kZ|5ea;6Q(4$%|SZ2aeRvxkr=Ue3k2+l1E zhiXxw|M{3A8i z%k{OCbJJ5xhHmpxhZHe=9pXNbQlw@Es=P}&rw}br&p{o;iLum4wcBYU;LiNiicC37 z89@e-ZbnpVj&Z*ji1{81so)&3ZiOmqeqB4qc-JeU6mg2tU^@THxGHKxJz0Pmt291~+r4Iuk@ zk?g31mx|^vBK>zG`0rtlwNt@>+ugV`O=VZqOz9jSIO>nqPp)SkN=}u9`1O0yA?u}6_;9XIWQ&9Pq}Vc#%4K<>weQt zs){w)?y1&%G~1z6>%p#3?Ga%Cj-?nT*k6L${Xhzw(o@0CNW)qkkI;FC+7?-%cVPz5=@{N{=(7 z`sl8B?|uMVvb5<{8^Y7*w1m^(prIKoe0Do+_rEUFu2qFhmr&?=--Ynq^>&&ZFWouF zl}iy%Uqa7sNMj-7t&1ZiZeu_*SZ6sbT30G`vbgiuD^e=AG>ID%gqjO?lF%`jh*pal zX`i{;c42ni3J6-YmYP;!V&0l#vHHA z0QZ%|4818AGgk;nd}IGMZYPd4ruIKQmfbeSt=&M8F0cEscB+AbgVchHn_Cl2QhYkL z93u^l^&+qUPo^#Vjr8-^9}L>7ey@WG#8F$F78`sC9{$TFD!2m8$9hLpWBzWV_~(*_ zez+CYpW0E8p4=Ht9hfW_mW{z4O}kfSTRaycZB<-8_)=WI_p;(4uLDy=t7K$o*jq6- z+VBR7&Xf4jqh*V*N#Cj`|EGwdi_kAg*Q+mwOYFWB05sXir#Y_SA=E0f7dMLfHPnN` zPjYuIE)}X7wX#qdrJOlj%?gQPK0hBGA8b8oW5l46OZXCjKOp+jqPK>B`-CFUK_I)q zE%^=Zkz6RSI4e6j;;#o-J4d5NQ`VHr{E@&~Bwqv+J40ti$MKc&5AZLLEjlon-LiM! z*q3SXO1cmV26_%8MjRNwWM;^X{uua3W6h&WbzvqSHx+^)XK|g;c6ToMFt%(mb{V33 zF+Y5}5DDE<%iEwN(tNuZLx;=XFx*A^=(OUYSV?&j$b-%Cn4U_74a4yu>JMr4 zEKoDeOeoouxGO8t%`^3PP6w*dpQYg7g<>+0N)DQlA8$!YJj24jmsYT=A`o=xwWkMt z%&6#8JQo*Fv7D{YrsZw49aKp0YpHN&d-hOlZ$!jlr0u`tS=Zb)4xv%jrBwrw`i`f2 zXr#>-PC3_j4^whixq?lAZ++shXaRAd0N_`1^I{o7&Pj{&^&0lWjgY5iy;=`qC&hdS4qnMzTe#3oZKEAfVrf|$`n-1YTeaYVTi_7=3!r%>wfa2 zP+UYcAB4=QbSMF6v^ruR`ROjkONGMeuF@LfG3#wKI(cR$DSxLc5SN*@ICTyxUX@v3 zjM)CP?~00$o{$qvwVyHRZZ!@`W{GvW4nLfF(a}Ds2O+=Xm4PKkMTJ$WVO{-iEdW;( z{56$vXjJi}^n-}sMdtZwZ|Zyq#5DB!W|a(@r^ka|ZYrQ0^_O%=>oo5w%7ezLJ)+dh zv-eiv3TJ69hqw=(fCyCP_deopqn|b788?GCCoh~Nh!Ys4^m?79f$@x5ik4vDmJ_2qZ^4l2XAHh)TCvuvV=7PA zlXwl~e(qAMnRU$&mF;a}LElZT#F>clwAuGQIAEtEidI(}S1lbEJ+OXzWyQgae1aul zdv85Jsm<78zRo7$k=mKREg zaS#^B1k#>wsu7nuuxQ#G`X)&R|;qg@Ta95&lY(2LdL zp}8vwM-2~$L5}(_OZM7))DOK)um!Xg_AazyadB~@t$3Jc;N>|r?2$~;v)G2>krtwr z%*v$P3{~FS45{p#NCbvOSB|iCJL15-r|VtY)iiiiV=-7YGQZe@A*n$nlk}EIan83l zC-yvjgyV2F5XZD#FD9>LLS`{C#&wO z8k!|igszfGmBNnwX8_P_{;=yw`-?-qkrDJvXSKYc)fxS{@@(Q1|?Gjh0lu4?w6_3CCT5;XYy6y+(VNn+GqY zD08{}#jl<;-hZA#*xf&z!u0Dvf$vg(2A2)J=*59?C`beogFg9(quE&+9(I*T_uIrG z=wfAc%#>C%9UrsaVY9B>hnzXhd+UH^D~etYXMPVJlR8LI^&5s~BKQ;MbCxZIc_pl5xs-I#8I2i`B|E*^GETQc*## z9xK87pC4Il?~QA5dkCtY?kx6DxjlEnt0NCf--oZw7@e)I-~`Nv4qvs~#E?PkVZ&B< zAm59}zAwZTNH?a1O9T!Y`EbVNm(M)BDZ}Wz+d4v`(aoTBu8qU!D_ml!=z2X6$?y># z``1eED%6U2lCim9s~(@P<6JL&WOe-AF9>g>YG( z=JkASIopsYL-dq(IA2eebh$NTu~QOPz>!|? 
z*=B#fe)lKJcDF&L$=qf3J|6wi^ItXE<*0{=AF9$Vr`yWy*Ih18NmKZ((<@GwfI~mw zAs@VLlDJtZr4J%?_-5C`EP4vd^J6qw;P$_-o(DQknFN2V*2Q68;3GyR=c-4ehShi2WslWT?2jd;wVf}w32`&YoVs@H^DW*u zC7PfLvEs#6eqL8xI$__X-`(xAH0_KNI8*^?byh%n&e<|`t4QI&4}QSe9ZPE^kdKjL z+SM5%kq@OEnKdd5u`FhC z!uHX9k~ZsnTAh>*3Ma8>7g0*!rxEJE-FFkvsTcpKQwtQ5tk*>T@FC7f0%h<~V~QZ| z1q-B-G;4`KfQ@D_dD3))ri^@yPxW+{mbY@uxeU22&#O$lN=SeiQfbnp*Da0jIH4S{ z1gKZ2v#d&C``$wa4h{0g32*lKF%5k5s$@XQogstVkFDbbomWFMmL=N~IIPb(C>PJE zxi`+Kxv$~*bihxLH%&+E%Oh@@BFna8Y0TpN+)Lmhllxx5hy!!q3Q+LQgEy{s*Dd0( z)dT<1MIx{Bxp&+h;hnNZe9L3oagczLh}u(+zwfxdyZ9kwTOgNOYnb~!>LJ^`>G(1 zQNnl+0+%vLN}7Ezt0QLIZ7em!?6ja3%Q@uBB_bL+-~!T!*{^mDomC#;HMX^7OUbwi zs8<{NDOS;Iv4L%w(8zr8Ef?MbMb!MXbk*I)vsjngFg|(5)qHu81}V+0A^`Qq8Ne4U zizp~jzi%2dVZlwi^_>du#!>`?SGyo;9W)-+MS@Q^9Eq;`_HiY-4l|lHz9(|t z{d065(i%BV$WG;6pI!L`^D!3;@56roXyWftzVBAv;TaFg9{-hhzFWDPW``4LCTHQj zLojEHZ4a2f`Dst)`^BDYWnuq>EBxRM=ycK&o&cYS^6m9uBHn#?-&6QaK~hCrdcjuWKx*ZjS|Qu4PbHJv9MKExEq` zF01{%^K2li(?n%6-$I1*RXcS8uHGF;Bu=UCMA?sxzHaQb+m zXPBF`ePB9aeHcp(4L0*hk>$b?5oM~*a~yi}=kCV-obN17DYT@!K>BXe@4(Fz9QfLVZr&hFm8g*1Tu%l zQ@tpJB`_g3&%DoNwjid_qF<3Bi!nL%!2D32|xXsw>m z)Tu5zoTKGr2vr;PddStOT(Y-Di!o=oEUwb(oV*aRXdo(qOuBYtNHq*Wls$AqX}qVk1RquhsII=7rW<^y5A#0X%h6R$?UkfEwlYRB960peZHVtJNW_qei3btrGJhkNH$p6Uj-y;*0lJR7uUbEF_v3T^$PqWT>^a<_!@CV8?ug&Vc#b!Z zXy0kP9UPuw?{?2lZ`rQsb`V?EdO4foG^GzbCuIKigr#SsOp+O)Kz;dakMh3fD& z48^>DOK`JMcm+J1za%V(>+g88(ejg@vG23I|6ERyT)Lg6`YYG6yGu^1h+5u8>q*b@ z(7aHGmM*wNBXy)W4etE+S=sU3cTZStobH%}VA3j{`rd^g^dY^oyQTBIPxQJ!Cd56} z583n*h7z0t#c^Wj)K9kxdOYo|N=JqZ@aA>7(` zs3$mni4e$#pHmkzSr6=>qsyB+NO{Kd=QH+1oatZmce&@FPZv0wV@+ZulPu!>SseF_ zHX4oh9-X4Rr{TT)!EHaLc#m(0O@@PJb}}&btJEVE8YeKWba0Gb_i?l>YHolsp2K@^ zJMTYuD7;9}mY(RvZ8e^B{N7y@L{AXBdO=-kbhGXG1S@U1_w&b()34lxF z)@Q3NuR=v&)i7YF`ALa8p4-j@UTqg&b@M)r9cjsE;(0SQoa@SZ&8hw6caGrF&Gea+ zZI#%@`a^miD6%x~^&8P2xJ&uT;cO`(%ocJG-%CU7;;CY!H(RqZF5>?^kJ-SN!8W*c& z^T_q6MjnC`KC5X!;Fv&tzGiVEV)RzYW%;tTE5=Jmb;ifX35f#;e;6JS#lL_bY6+JT27H@U7^aZUKIC^GcZMA#*8S)dyTo`Q zFCW)R8ab$7a@Y-HydFH@_UJ1)X694TOquGp7tyl-kwWw+9H(G|y$2I37 z1ZVR2N^QVN9tARxUmbCA`PC**mtl8o9FO}uAlg!tLZp^|$PG6R172cj!Iw@yl;jX; zbsnKL7rYyE%H-uignFe0xWs`fCn6KA!HAk$b!fh60B~TRxpl)DW$(T_qsiWkDplfN zkx10qbHwKz2k6th35(F{d{8M@mQQW+qi`Ystu_@{X}3Y!cpG#mJykmaKbX zS=iPIp4m6XTY{ldN1m+!&oOh%ludmt|K%F`@T~~+ur2@NgWg!Ml_qYN9VpG_8(DQJ^y`l1Rne<)}l-W0(dF->~Y8}^U z6j!tx2=qR_u733@6uiFbyR*ee7`m6h0o-5qU?_W%1*(-8@>naz3686|?lwf;u?F3H zc4|7eJTz#hQfA629Gssz-Q)uLw4mUYPR{RM!?NC9w*#mB4z$<2^Co5TlkmuHD1#vE zmnyfQdQXh@FQ-<155%f>ynqxVhPtn6CB$#hohROqYpz+#3f3YK{N)i~ePwfx${o4@hUec4(xCOi;tYI+ix@dxQ?`tL98gd>y8KSTGC zl15plG2TGcvmR;GN`iSIU;YCy!_8G^qd-mbU zWixfu`I;y8^WD>vCl87Gv2I)Rrpb3*Lmv~TUHls15&tSG{wWKRVdi!POFqQ)PgY7n zk1C&c_^Y_;CxrHmoiv_JeEP^(L?m6)-a0Xs*~EFqxvCFEww|qSEKt&3+Axa9S>4Q# z1qI`yI-X9Ek1}0YN?Dm^UPGM7v@&6&h%QGJ?MKE*dTuZC&K|;S=aEHD`q`iCy6eJ# z1q}yqPd#^osqxQ$J$;b35X!p5907>ZAbE#>kXW-JR2 z1}aha%N1WyLzonrs_cPQ2?8El(H$7%C7`C<8fhi0YE^v7G8QuUFwB^qQq-Vx82=FM zrh$=SA$wjRgiE5jqW%5=&KF_~5^T%H1m>tl+vNqV*4Dp5I#z$pkXZo|-1Yla54vKY z1S?-Qfet4nMZqCw(Z3Zk_UM@CrgiEmu2*ZFkuV%Sq&Bd{xspOEI8=s@7Hg-ZI99KH z)^6Ys#y{pHuA5Gs2-@k*l{vqyM1D15uwyKw+ zHW%;t*A_!1fABqPt8ER3mgzLhEJvbzHlkQE>)9qNtA50~#CvLtdVL$AiKytCiGvF| zOY=(*Qtj2#&8$G6k440$rAOg=&gDs*7S^QTV(XRhfSz?TSXp?-%Q5epcfAV|q@vNy zjkj;*vOC3yborSVXi#1;QvT2*@J2`@NE``Tv%IsM**scYeVEZAd@gi90IF8f^_lQp zh%kV>I3xUP(7;`cRk+{twY_vVI=)6cX=pUqXci5%mRd4!%81oC3L5Mu= zv7($GbL~^k9Mn8KbX zPcGHg2ztTrHuU;Lm7+3Q>2B-g) z)02VPonr8lyQ%M3cP!pX8J;`wO7rO5GS*d+(l*skb9GW4J0$S3(vSN^x%gT=POv@h z3VLJ->@>c7%qpYXak*ZtcKa?H3!-Av+kV)b5MQK5GuFk38SB&1g0M#y?u-gpA`(Bk 
zAFr^o)qejnR{N7P2qA3Y22xkP`L7bb+b26nuYLV!i+()z+v$6B>}2(JpD${Ot+a!& zOnpOQI@EcN%3tUk>wd%-HNF$g1p$(Zc$9$W&p4rEMdv&;?!r_%k62AK0dqQwh55!= z1(nKL)@|9hz26~Gkg;h%TT4Z)N#A$kGB<+{!6AC;QbOFRtLlLS0FbQA>Mrw{P`M$O znbc6T6C!s$*$&XOm-@nKV}@~1a_r%{Zp7-L2lO!7h!?onh8H)jb0GjY6d&s9jyho8 zMc5xnU#$->I)J2(FAJ2pqBc%3+C{i7L=?pE;sEzhDCfs@(iJ55cgHgt)yxcoU8b7S zpal!7rRksO2-x|mBN}wI?Zo#NZPYlnmOyYnCuJ@ajjqJz!GN1}bg?KrE0SH9?=ytzzW%M?h8Hx?|{M9cteD?r+9DByA{RdtR#MC%9>17i~+UDPWm%8hPa zTy^XT;!0*Ymr#n1NwDo8l!ia=9d(o)&7MzIbQa;UHvvHKOvoxmDeec(A$(up_n$BR zc)-a%(}(1>@r$3DY_AUXN2I>ExwlZ_da7ECDD^VdhdGxY_VZak{I^LIc^vgue?yqP z+q6DAFLQeM1pi<=U5}#S-J!XAaeZ|11Sv=Mooc&cM+INSuY9~?i!7n?yQ0n^To=v1Fmon{p};!*-Fid_q{x+ zzsc$^eQDJDTEa}CZmLId7&6*t`q}=6lQUrbr#51(7JfwWr??D?ent5Sb*joTf(IgH z>(^U~MXI`CH2L117Md!W8vq0_d6U16Gj_v~YfEbS&w#6ZF4WNLGwcq6ala&j%F&rE zuI{u@&byr{;p{nCX%$Qj{XeTL9^2&hX5FJcU@>k@h6h)Alq~QNUTkFcev=c_U7!kgNg%jLTSQVCXb`IfKm7D{O?}Z1DK? z%&i9FCFOB3Q6}?CifE^&;5sK|CLBO21i(gi^_H&e5l)A;%S|inpha)$5ftr3bRY(oe?h!BVwzp~mF9?DSxWgW(qWvXzh`f(KGpd5iwNon~=a6SKdmYY7J?qJ} zO#Dt^weDUDO#LRR&qO9-8*&*c(&mp9WQ4ONPy|Fez3;d+-a(2%MW<3!X5XRo>*Yzy z{nTu@wwnA6=!q}$`1MkMlTH2146z}J&3sj#B!WR@&oYOP87`Inyfw|BG$a`qA~+rf zM`)WpAOxi~lU`D0-lwAJWp&$BAc3o}Tt+ z?IyMtw^q{}t3W*8_i4~Mz1|2Bk=u9dLhmgIZ_p6mybBHVMO8oO7C$(o*@Un7M#{`M zF7K%D;)btQc3I~2(4xmWDfif*@0_C6{=F%#j8AWQ#!PT$T-|2;#AXrvkkQIg(U1Co zA7*!Y+$`TnN+Dg)jsu^YQH=3|F;lua*Cx^PtDd*}=?#VX(Y@?+tB7cdAh5L63jg4l z>f)ng+CacRaP2=oqr(v3R>(y52<81s z5=Oj@cZT)++Q|kYZBh{nN-8?YZ=|x-g|$hV5&~iKErzf0O7XoHTYMr&jw!UE#~o|7 z3dbSM7?=cM!P1{tmhwv0!4DDm!dH;rfB?Jf%O3m2+K}%=^b%_uY`yYe>*Td=DHj=k z(uUQTfFIUsG8^(4EOAEW!<0urXrVXX>Q3ql%v_r+6-LoWl7F|3;5BIb^ZinlB;AxA z9pk(yAW^7#vB`utBOMu_R$x799Mj6f;yHrD)RZHhSyTG1*SQPM+&f}_PtpYf=ac|( z96ogF(>?>g(cDOT_g<^B^NvT`HJ};1TN)%DK7! zGUeC7E2}%6Tz)OsD%92ewx=Pv^}sgT1aOg5_iSFi+Jt*3WqPudTzti$>YI#+Akb{D z*r8v~mk`^+iwOAQV{{;*Bg8Q3XT97eXAjr5WYEd``0?5GnuhgfaJr>UXcAN*>z%Qm z?nx_i$p(tR4$1mABLdr`Lw}9=tqbD)Je%T*>ZV6Eyt$)3El+$H=R&j_*^l^O>{&PD z10x1D*j>f9Yt%ZVV+LK13r?04wi<=+b528PJL@EFMp&R~jn5NV7WB2|=!ZSE=*MiX z^yVslZbVC!Z#uImYH_y+d8+1UMj>h?)?e0%W~SN?EOe?H*=iR7WjCS@tFwBUw^5q^ zvhV9^aH;eiuma(`nU&Q`>O>tY$6g5%Jbi?1%8h&DVqv>593Q+{Z}WsPOJ2pX=ia#M z?X5laTpw`V5}4E{?wJkJkygL>bldng1zCTkw|1Ls>8YpcC~sva{-44%za^P>zi@MU zAu5UFo{dmbDr?@qf*3T4KdeG)%^(C|8}k(()d)jYQu4%a;m{G***N~Y6O2p zfy4s^KkOBV_sDZvCZb%c;Sh)G8n&m_yoH@*|BPjRzp?0PMQ3wqBVCMAc-CZVISm&`xtQa^Zn z2`XzZi~pGX(nnV1biP8GOS985d0wz0v81*jaxV0Scv6bm=e5%=30P9MGz!@I*6Ynj zUt0Lupcl_oHFJ*)4;m3cOy|ypyW;hiUtKF-Da?AGpDw=TVjIbMPd_(vsZ#-~B1|eo z`LXYLmB7~y!v{}JCD5<8-qF)1-ZP4Yfn$idmJ8H`PI*`aC3sxgCk%9(0Wp6g=#(k) zI%wZj!56O-%q+!a0p0g%a(k~B9s0$K*n91jvhm%GpiJ7LW69Sbj@!!%SilK)A*7{7=R3`r*UyC=83HtZnTot5EaHSxNiRFw9g|2PW zy!8fg4}3WvCTnVa_fUHsv9+>UO+I|eQ*!U8EvCV4cy_J?i7HPu9=x9~ut|5xezv4K z2D5VH!CTj~P!_K!EriYZ_}E=PG0Zz$t`|Wuzn76=Hxso#8Z`0+XIS!kJsORr4(>7T zy~U_DWjuwtS1a^0Yy8qLyoPY-dg5=r^btv&<)&}tzvLiqD|2o3Ii7EpeVDAy(}h;> zAg_qafb;E9mWB zDUn)BAqzH2ozQY6flKwrTdA2%Di=Q#%5fAID?W*G4tT7Pj(QsjT;M=@ccC=1<++`K z55b^im*Dm_i7eLVH9p|4b6SV)-o7FAzygl6ab#4ls~;=+QIrt`1h&|aScs31Ug0JA zGVD+m$aMc538E3foOsb{Rpp>h;JH~9akEzXGMabBYGwvetG#@0bDQ^wW3dx{lFtNM zGy&xT$xzFTXHb{wZd_gavpG<8)cu3?jfd_0{s}v$ED=}t7s@v`mD3u{64M*qK%(ye+b2VK;?ilF zm;M3nKR=Zp$n>cV8f!Mv8nG-}(+IVqO3A1Q5j1jd5oSX2YF}+Oc5CI;^+j5~&}%!) 
z2zK`S^+dDTxnp{=H#!j^C;d%lUy?&Z^e;G_q|wg0`$_+pzIGwYsy0_ee3Y&y?}37f;)C!XO$}J6B^dpDocLIKTd|@JA(fq6C^R#6%*s>NECn*FzA?Nn!f1$Wu+q z80rFRmq-xChiqYXdj?Ic-gt!g5jTad_vr9N9QfXxDzAg7?5J z%rH{c@_Cp};{H7N^d0XpM^%VghS~aM=6QGpM(x7UFeo@&MPobThl?sWW1bYe*Mj!t zmqUG-7G?M&P{U@c5s#xl=-xsCch}Q!$B3VZPZhR%^%8+u=Hat-*G=;L>6TKYY*JcR zQZ%BZeSWec@{jYGIc_Hi{B6j7SjYAVcuYpzfUkd@qE!R>=~Kvg5n(lSta!00`hDUC zK6>eo;jl_*qTprs##b~ty;YV8H4U^D&&uv|yj?$6@?-f$-SaQ4`c6u`x{HS`c9Bz> z?$_6sK2Q%iH`wfPX>CvpO%`gy!_6&0p|zhmHQl!a*N~3RR?pz@Nw+mmVsANdG=dUT z88`0qOkTim+jG_B$1|ut3a%79#_8W+hRch?FY3vMBj<*>-=8Xtt!}vci!IPQEAFIO zVATY^XltmO6B8|TL8H{~Ft7c?DvZg$Ze{FLPytW3O{?rngl1lOL{Q{X=E~8Qy3**nJPDN9*R>9lW^5 z5-9`cWES+=RKxy7G{Xxn6JhaP*lXc_h9rK*W!@~8j01D5wrFP{O^VFy|H%Ei12Mep zu)1Mw<{9cr*q-nnsa@M=j{aSOzUSUZOUzK_ypGNsCWg;?72(IO){DkxhAhP0yk3>z z8RaL)S5kYsImT)ITA$41dt5VlkEZNV6DCJQfTcb8g|K?Bal81WA`}Xpy|f~-`3lb3 zw>y9qy*-{&^C7(VPGWko$u`#ob+kiR625SB3xWPs>h|u5KEeqmPARVbJZ;6%1gjI1 zJUtqlvbq}NU4z=ee2->vx0$BQi@ZX1m`&Lt{g0RRQabaX!yhPWr&0tn{GT%(?2NgT_zLUhcMJJ@O z)wHaaVz71~UM@~w(I96m-C*B4`JJE6$n)ANiF4jAOMbzW3rP&yoAo+f!r!ey0>TPx zQZ{GGTxEFD_h2iFrW3)EYX9PwF5NvuKn!e6r67Vg&@EM!Lbq2VLW%;0r?*}uuC4l5 zZhK_K?0YZ1vsgam32qEJ;##f9$l)B^SzE<7!D-XKAh`K@uv)&x?(CPx<>y~|ZR0ky zgdZOf_tJgQe0h0>e6osJ;JuiYrS&!y<06E4)yJ?*Vy&Ug?Xfk&?OOos`mH=}$6nRK zABY5eIfp-MCU`dIxUd5z_AK@mC?E{1rj z3ptx-L+|H`+-MjuVB7JHBf=asVV+m5%9UegrYlIq?HE_TYZk;wEz^6JxmHvKJ;-Nr zmGv%b{Hpi>uPH`x@5z&uat4dMqhD)*L#jMFzPbC^&M;J{3>R_sB&81DAJ>nwWK;}f zjJg&6I=;$+n$J#)g>Ps6E4{1ER`?Uyczw4mCx;^!> zb%*v$m%R>&Vxk#ob&d17J-4EXjIUACRj!;GM*}N`fg@rG?$Bb4N?xYFKLbmIATjL&YgZjJ?g6DHs}fC?zn)D)ICp%}%Ls1|ESxY@F5 zOuC=PRV!8H`O#+LbxS~&vn2t@{7!xq_4sQw0*yc zmrP}as+1{BWn=VqE8NR%%cPnMvuNS~<3vkQv{SU-K<3AQt?La6=S67^5Nq(-AQJ~5# z`s6lQEQG;l2IFh1PSfB8#fg7zR}DYqc@wH1S;>XJ>hckhw#j)Ahf=AY2z8)&$AGyJ zW7B@74Xc__Iv%5<5|5r|0P=jTSo_Ps64Vn`9Qsr(-Q^0cLjOgr)_pBPX8q=rgZbo= zUqNPXS}}1GgQ^Oa8m$W7Z8tLeag}aazA^`+Xgz+6tVJ`S$&2y$q*ZY`d5GzGQKjXP z$Q68F{0x85<;b#3;nX%x(JyWeli{kQBC$ZRo|27Ko~h-3W&!&01>oi`}ECTIjX$y9tn)~UzeatMe^N-UOcBdyeVEQ{uf0>5K6 zxWfCQjQke9y3h%)VKZx{piBsTY!|)Ud=W?k%0>(a$9;>hcHFx$XwC}24*aXQ#R=hT z$Fl5&clM7akM$EDIx)83*!Y_xaq0 z$xT#u_??yW>V%ZGpxVUEdL+rpNSkA+?InWaH(~6=Y=xkVC^uqurf29GaevA}{7)F@ zs{529DygIM5xqR{D~^0met^q>RJMaUK|V?kTWBaIHg=iAJF9`&$bG^? 
z!78`Lctb~kp|Qu*kU$RqQEeo*8->6{@Vn$L@sKz5O?=hYx*-b}QeK&a$Xirgkd_O< z;8z$4IJ5Ki=a2e2=y?N?Zr0Z&`=NAJyKQWsV*oseg4YgJQlemBSSmI**GNipuz1GW zRQjxpM`uywGym>_ZJ`wWCqDnZ=LQwtDJKk_kO50IxlLX7#v@*+e)nL03KR2f$sG5< zZb~3+I2O}sd3$}o)H5&=v2WLbxTxsc#?IP;D6TLkdi0L9(I#D}?c4aeywHSKrQ_b7 zu7jZ){Ri_)wssorGPT$4YvuM?toxx~~(1)>}GIQwjMdRDWp5UhjTMc?Z z7;@h9tN#_cQC+gbu=61((wsx-!|VOBm~$10nYS)^RaycWYWWVT_15fL^&KZ|?)iY` zM|D-ez0*(f<5e%Ts%1Q|*Ob3Ltqwbf(oo83L6TZbPi@h-@?6xq%psWoALr260KeQe z-mPSDb9f+6P$J2YmUhrlLl;@($st`xDvDe4cCSB;l0{^f>iP`+;*06;-8KG?KGi6#(s=9MQ2uEtzCce;5L9fsHfOZ7%AKO31nwza9R& z@agVpV&99TK92IT#2k#x-BPA-fO|`?tZMOc&@xqLpD#V>T_F}`9?(P)bu(Xu#-!Eu zjZhh{79U5ake|2lR@M6Otk-cUtESBrk$f6bs@ZI6ll=Y+k9w{c08cB3Z7LS)wPuv*H4IEgeTb0N zs-J@gD%EL7TC6n0Gw8lwJZ9_Bs#JgUD-q={tEOx$+S1cnlG8BnCS&i^(spe-dY*he zhb=kr2CO>Gij7-luZT*k-(2zyAsBMSNU)UAICaTo*)|Y&)<3o?^+cONMnfBDNn3Yk z+iLJ6X06b6E2T7k=5h;|6J;sW)$0V0Qlp-|E&piUma9GH(9ly>vjqgFrcc%D3)_&O zcz%+G><{0he?`qL(acYGFwGu7mlDU#X7UNDAnxE}6V|8g_Ur{E~T=;*!{h!RyzwGQDs7Ihv zucU;=j}==j5bfUm0j}lPXLqC#1rjS%l+p>YobsJygM*Sa7R#2J)n9yC95c{VDs}bC zrIgEMkl_2Y5}Zu~@SPF75drMvWsmFlF8Pi;h=}T!;v8GZ&(~<`#j;urV0f!ri)sk2 zf{BK%OBYJ>xnKWq<0bUkKp($%Rk0pBjmxM;$4x}PUhZYW&CQ2omIrhRJ_TO@7k+W^ zs6S9ITMb#A@UJN5k_#&(+;sD`a-@V^K z#h=F}@l)Ogo0fw}eqck|S7&TJI6Rs8YJv*Ag>*Xu7EXe&!#aobj_TjF^Z(;Q|8+UN zrz#=aD^~r9|KS^M-U02A=t447fA*vO%Z2pMFAlW-&e68dMKS(&RR1@t@Z{rH6h)G+ zj#iE;Dgw>WBR@?1ro@KE0E}WIQ8yCZt4Bq5PFT}masAt z8JY0I`Pq4a><1_{NhkGfuCne#&O?KZw)AQT*Wf{Ne7@IhjZ&KiOD$2lU?*)RwLG!= z!9=j}SsqJ`eV2^;@iG_Cz1mIn%J<%)mb@1HO5+=qT^Jk5oV7y4dH~;l_{FgiD&7PZ z<-dgQY>Xk2(1^2@Vta2r>Vd9o76*nN-E}kEx5b+C?d-}+lSL(H=LKi8sg;nT z38T%9E_7-8PgP#Gw-|n8I7)S&UN@VK|H18Xk+kIYNM7T4t%rZswTMYbc%Xg@*umiM ze)4!K`B=|m&jFD8=33Wei;J}TUNPR*YBWFj79=qNtnV|L*M7C2UE^{2Hu-kW`gX2I zegNQi)XS18Rp@B)o9$8h+*b#06{OmN&;{;Af46xmi7wu3nJt)(Af3*b#t%jdhTx&( z7nWi_bX0RAn9v%9wM?!7@Qwdo$p7|4$o2K7nIIGOLvWFvx=E3X7LAx)t5FPtR^2qu z<~c6|gUsF4E~VScCiO%HP17^GpLLqJBbmE2p2yw+o2hobF|WzRQ;p>LZrda)^*bvV zv>Un=x5Jq`Sr+Z}7opn(`I7NLHD69yRUPk`9yi81A@YKc+aI9UtB(}=Ubpo>`l*wQ zR(&3G`Ld^SscjyoMipkg4Pvb9&8g`G}+43?P z+;7hkw%^?C7LQ!r@!5z>AO{wfekWsh81^ab$vERNSqJ+1zN?C4StdZ#W%cJ3|Mf<2 zBKmp1;V>iaOWI%YA}T@!5KUrdCYluWK80cAsg@}9^BkPFhRH!h{F*Fh-b$<}Q_Bw~ zGA{%8nF5Y?o6K9}nhhYk)aJ1uwC49zd%dGq+uC z7e84nR@y?@Eq@Ay74t4->?zG1oy|^s;3BZkCcsZtSF6xE{KSugN3C5iUvD*IPb1o1 zKd(66A7nxrcKmBkF>1-RRMAaJSwh?v_%(GB5N>LGRrOq5Q~pIX`;X|13E7Wcm~?Lt z1zT#OSNgsY4e|7gDz-4`+hk^?xVSh%JkrNrjkH-m-B#D3u4hQ)y16mBod)qnk4w$x zQ8u*>EAFf6?$B<-^k$_|LW~#EH>FhMlvbv+g17 z)b9jLv2r~7A-TI;8|)=`U=|NpmH1P7Y3a{fw~Z|sL{}U3`!Sn?@e)irO}~C#H^fey zg~)nXJqnzC|K&Th$f-GJixx0|+~H3bGJ(gg}B$)5IF%U9oO zhGu}q1mOxR?yy(xmkD6w{RX0_hkI_1v)qQOoEy1aMH(K#_B>XPX`h2t^XGUyzJ4${)H8mt?;UjqBq*%m~~ z&mVC!ADl78vtcr`$IjJq#hSbvgo!OB-&Vj zs>9j%qAE7|3>f~H(gvHga(sll4Ry4`w)!)Jh62!6Ha_(Htj5Z7DaEb%b&Nk*uWc79 z|H(PeU?UJuKjC`CTZAg&QKY+D*krjS(s9fiUCLitpinq@Db&+5TroskPSVR*e~pq zs5nd%sM{kkt;g;PxJ+8>D%xvgfm>e=96Fb`cyMOZBS690dUd^vx1T?vge%EE4Ojc| zY6MNJsMTp9A^QjzM-TKP0OvCHFLkJ{)Rb&72s{WVfK)E&s$TM~H zKlKTJi=-+c9?%|@|KOVeE7ng(2Jp~;!G%Aa|LzTb5wDvHl#$%+PulV6LE(;bMT)Y- zK08lBLqnUb%b66azFLU1Hk3bG2>ST!I7SGRTmbNwa?1o7y% zOuP-{trXCjOyloFjZ&3*TtCs9mwkRa2DzTc3Q~wzE)W-<3lo& z{%S|NEYa(=wcW`!qLbkA#Q%V@>7rwf7 zLs)2NR1=<-N~J*))b*S}#VW=A+^(zbZju0IeJgw{IH@kxrJG>7~?CKe6|x=ja`adF^)))UzS^Ri3kXE@2Uo ze7SV38bkNH2k+}cyvZ6dr+Dv+9sR9&Gh#RF!`Vxo+ui<3lbJI_fF-la&0y1B&8Sva znYC*P*GUvB#%&W*>h=1V8DgEG0?Qrd3s=;N66^Ad)Sb!6UM zGL7~U{daNTKYWV+ed5UE2PxSMz}&M{y(`zLmy!ZAGFHN`1(n*haHe*I5mE8Qex=%V zdPz z_)Fgj0sfRpi!)76koRMLol^eVw@~G+pQjsCXxqLI=WTDJ`WYRM;i77Qq zCKfiZMlNBzB__7t0f))*%4KphDwvavJwsn2pVM2^#AVi^!=VR(8&^GKwHhrg_UN?g 
zss^!4YnK>sl)DM1rXaI6yvF@k=~TyeuYls5$*m~PpIiX$iq?GUEMXVc=yfZ_jZ0uv z!>rRJ%4xG=*-)t{b|fLP!oeaT@o-wWI|MuKpj~d&Jx*Z6&Pty_JpP6pZ?bTqIjOjR zyI!jjSk*3Dv6%7UcKujuto`~W=%d>*t?vW394@0CUA|QB!1by(se;Gj#gH8)9tpEu z$7$Gw+7|ndQ(n)FM~aWW)U9sESTvHE9_xw{GlKW;Ro>2)(vx`KH}9%<`f!yR^2k0p z8W@Pwk$6WjSD*CiZ{z0yRRrN~;rvG%H_4@9<;h$L^?C*`bsm?!cd3omP8UYWbhpIr z7Z$a?kAU0XY?i5#zvOZTugWBHU$PN8y$nX&n5WovH`14s;bXkqpGy#Yyit9=G_SwStYKc+cX;(9G(KPS)Nry) z6APckuudkEWy+=_=oJ&4*Rn>4&(*Dh)5>qqUhA%hj4YnFSHu@Revo%!N!4hnDxH<eb|gJgVRvd1vmVk7%&JTXM~-%P_9o!_Gil(p<|ig zPG}(=4PdXI%lNY2lxvroCdv!C9eL%x6cZ~==*kueruYT7S`v~==c8P7yWddv%w{js ztPQeWY3rpCQQQICAk9{vb`@1ry(VN=>Z(572Cu*tnhHbgc)QNa7H8{KxrE@4n6Y!M z!qd9mAGVnvEmi3W)Z6V%?icngAa+eCK|-;p>JSkDFKtVeN;-}i6P7Y(Moo)wl2#z_ z-N3}d?Fa!_ixA!EJr~Gk^;sr3lgl>Ap`4~UQ>Z~B1eacUo<12UlJt}{Bm29^_IrZJ zJ4(vliR4X?Q8?N%ns^c`&dJWn{#QK`tDRP%_`$NXDM zN}%xc%4}X9g;P*Sn*|C7%_kzmRaZji&L?vnu715m?Y^I!N=7aE^(N0@-ga@TJ|rd$7&= zUEYzmbsqyf3SKU?2`i#(eV(DS5I4ZlDbp!;3Jfl!RVX(aN#(%N;1I?*u8~QetTC}F zd7|m%<i1lOYVD`|14apXh`N0oxC;;thr}L3=gIXqz zh}RWB@5KIzX+peR8f-d^-@ID98-yLh<%7`RYIN!q47!z_vrQxXUYjAbHL5sC3P~M+ zlVB1+r+{R!B;9&{J|@`5XbWZ{GJ#zYv06@NMs0@S&<}w1y(pwP)b>U!17B?In{LB6~?zdae&02XuQ1u3tRAa@QR469p=JIvWhY|-9&&?gh_}R-AtH{YJ zgPyzJUXz56KEct4+a8ijVpVVlbyQ{yb*6`+&)umNAjk1e79JmZj8LcUUpHTElvZ|5 zeaQ*joG&Edl7e=?k1l2?{1%tooeCg*k*SfaI8qx=Z*ynTPt^RvKT91^0Uk5S9J9V( zHzvNS0_MgBg`D>%>U4f`sP#77N5w(224Vl}n}Sk>v+W^j0K;nqlmFB`_*14nU+4xo z6pfeCEdJblb&UR*YSZ=~x}>0*t}ZBroqZg3#0Mq^$=j{XdH>|ST+w3F_jX}v@-6T1 zob6k`9;b~D4qx_FD!Kbl4~ka#bdO47-1%w+fBS-l>eFRJ_3zzgzpt2IV29TLftVM8 zh3xupmATT=Yhc8({l)D;cqO7*sKOUcizNj;-wm5$<j>^<<-6HvPj=_C>5K1_&cQ@v{wV$0iz}UD% zQKfQ!6VJT)+f~lX6z~=oxVvH7U9J367iKl;laW{$>%6B*pi_;3W_m(?D8sK>4bEW4 zzDXC!wcBs5E4Z%OxvW52wpjJ!!SGe| z#0J7)=XsQNLVqEvE{xm#3>y%pQ*$F4U%92-uh~^Pb#;p^1d(j5cs_^ZMhj2?(6Cs( zyAKtsMrT{lVu;SB``#}2pqI~<F zkyJDDtYlXBemtV@#fVI@wBH8`3Yi>KJmoF@xU>lVM0!wGMw1Y?7J!OVjR}&}Ie5)+ zD1YgBMBC}Y$4;XeCQ6v#1YJYoEr;%X(L@%8 zF;_eDb3Q*0Im?t^n@w}fn!lbQCsg(hp?2W6D{8=z+EIqk)vjw%@16Fp&NZL#{$`IU zTEZavOhin{r@R45X^tisS+q&P`!7t){S5OrL1^XxEBVVnf+t!}4hK;G5=y5qezCt9 z3l~VpQ3%2f=gaE?jqOUse%%|XMI?;i9!ujd?1~*rbtRExSktCOF){C=;4TasKVKLt z)%^U*^GK#f=c^LFcms72c6*91I=J?jzeU$f?Wchlfq$(?L$POUAS$>hH7c5dNx%s* ze}8)}%^2xzFQa~Bv)sVH*cvhureOjd5tD4-@~8gS`l6rCr|1$xh-x9|h zK!<@bSGNa0nH1{sex&oci_wT2di^@WqZi0Q-J?rEIHucU+cT-&5`pmEnpPxN)ec53`n}N)aP}K;7c7U= zLW-oJQD^hi$I<|?)}*ITmQ_mk3n7oQ22L_Zo@Q;IQlqU+a#r;peC<{4yn@(eK*CB5 zr34l|s9fMVl(NNMW|_)sRRf5Wavcw*8j1Tx|0jJ<( ze-jOoa`d-z>orhZi^KMvm1W#BC1wAKfE%n@%8MNNQu(^TayL2frtou|gt~NcQ%(f! 
z9vWM;$uqyZ%e{eq5vApXCGTSqR?ktseoc(;UO()5@?`fD&-(TssmJ)PYse!B{+&YD zocbMtQ;L{*00MA~F9g6MXe0s*ncf2I+Vb9;I}_G|5)H0j6XJz9u>c7Y(OyBJN>m&N z^b9Cm$osMqNOlp3myrleP~d((-Fi&?Ija43yHyV-HhrbCcLN{t`%O+PSJP?Y=S0=>wI% zE8LJPQ5x4Ad;@~Ei3X>~H6dJ6oy+0uj>6kfr2mO`e_t!8x~86EAL49@Nm)-HIMmea zaJD;e;nPo_h!W47!MLSD3Cbja0}R$QZ>fYV_svdpP>hsG_w>Po|BnnFYG!ZuPjWxF z8|RDt!+As8-=@pMVnO;wnFVwfM7RFacPy23q8rQfI7GKg)(C+Jl744_GGU!46O)W| zpAu~ABdk_HqAuCo&VJf~W)F^mc`IKF#FInM%U~wWgsT}1DTT9)uktkx<2;sUIDRyc z*Ia>3Zn)ndhrUP?wi8-DFs4O;`_cSekmi zUi@6lf^b4q^?QaHpQ3!)7e0H0yH&?pQS&yxXm*TtUHmY%cjNE?bbhLM>U&ae+YUuN zPq@v7R$f8NckkoFIg_iU&PZO}QVx4xR-fTx$6w>5&DR~lTjgAOvA?G!Y+fB|4gGK+ zH;Jtn@I1V9+bpJjWF&I2_SQ-$lXaFMwuhEu5)n~4L78IogC3bm)d~@T@mvr6&;&F>T zTJ-eizEcPPq|}EOplp$#HpXcFD{G$K!bbK|TN;)}_;Fz*8ID?xFPuZMFFF$hBxnId zx{`c(A&=XU0VqrG;e(28XM5(opeOW+nV1Xww6qIHoKs-<6s27{?L1uF>{G=fjX?ikx?3=G;Y=p%8uvHR6@vDgXblFvuQ})N9ZjMz_CEwle?TI$~ z7^k&8Hba#X6>%Bd#G?{Qh)ENoOykb3(D+Z>e5Yi1p1L=g3a1{^ zK+Lj7PgeDHDvk6bi+Vy<@8mvLK>lu)8d|m-fx6nddl zsp4V2-ki9jT(lcYRM_EX7o7kOEF2ds=#aG~E?JmulAYi`gh z*@ad3ww~Y6$}0j+NoN(1Vw)lCjJ_fvU{4Ow&@iSYRCyxDT{&LjlX-}-MU~4YI_+%K z>-~i4JuduH!GG_B|4uugVA->zmf8zIM?$mLV&l@2En4}jUb?6StpFmLKM4Yplfz4k zB}t&2sO@Im3awP>#~^RPtW4zFgodOa&AAf#baobmJWbPLU)Ln#0U_Ini9g=vtB&Vk z`R{ysaBorTna*`~x_M=(D1w;gszs$BEt6~_b9sDZMv&o_2C<(oOMy=r z9c!l(&BxB$5tT}a!Y2CGZgnOafeB5+bN*jyPp`GW|Pbtwr*)htz>_Me1 zBlTs9`)+m&Fx;bY4?KJ^CI+E>H!JJOb-}XZjMioAjkccBYw32T$YEDk$HcO`vBo%e zK9NQn%a&No2oO#nLP^RWcRdX@S_Hjo02H1Nx6uZM=2Yg?kGWOc|BVSL!)IIYY`AtT zwY^jg{;ul8>oor{L5d^GSHIc%LNxiyrR=2Jcl+dvSx-znJo+&{c%#y;mkp4(mbPup z-QK4-xv_kSi36~usIB9|uwggMm&PKSspL5MUMG(g zGnw_hSeC0>l?&1ObT?QP z7DBIDq&_Kh(1{+>kJvwmFjEfZaNvv2Jm+eJS!Jcu6G(|SjHB*DluYS+ z*`8+x8osk*J8ylw(<(@JJrj9b_w10C-10<2XOvTXHjFiFJ?AM5VhAG8HQyQLrS$kLfWTtd~vVBYwoBvv0w}i zLzLuBCtGfD<)}`D>%C*6_&*cmAE*(E_UyYNnw(shw;HVG`V5wL(_{g4GL1|2ZQfv-fsy;X9P%j^RPgk3wfP%(gcDC((;$RawlE2+kLFw7!2 z*9NKOud)K=SuRaRz757!(l?6XECcYLyb_RdUtZ(&P=U)*562CYQ`Xy5%>3fY`w6tV z!U*%16K5?$vMVl1m`o+P4j()vMawaVw3>$btGOkWu037X!jm$Jk>tQ7!g{6|eRnV8 z8K@D>iVAvs%xER?Y+VC;zm<|*9ypW*_kw8ZL^vw%%WC`W{m+|6-4o50-wU^?q9CiV z_>l)kOJ%gw*p%U(?SRAhZhqO$)``Au0cHqw&Y5o%jO3p&(9nPmd0izl1^8W_N@2~n zy7?Aw{GNaCe;h^l2oTF2OY!zn8^*bEZDwG46S!DeFz#c(-C~IuP%_hYf97j^bzo;)dQzzVUtRGMuT!uCG6%OPFX%vk+?%ZoA z3UMG4%7|yP7gMZR(wbbIqqizEJfMtLsCB}@l$q{HP-#M8D1AvaKR@!M2<;xj&ryN& z!!~J0z5;sMF%A+apCfyvf@%^^ICn*bBQ1pj){D)3gh-o3H%pmudy9pOGJ5m579&Sw zq z9%B7cSAJxa8Z+kEHdTUR<<9KZ9P8t7=`u=#HA4NGR}sjPStu`ldajpd{n|^REQ|B^ z53tb;wOwowIcnONRg%?gu+IO*<#F+;_D5HRE_0&WG)^w{ZRaB_K8+`#3^%9JL&|Vz zt6}2CL-3(#CC_59p&pp`IvZHgkJU~n#Jj9%fP;hdwiI%v5!xq;2zey7ppo4R{8UyU zGC93){;S$+zNJz$`&T~QWUcqNZ_kki+z7_#PNJysl9bO-N403Sw@OiUWX;qu3b25c zE%t;e$jrT5G#k4YJuy2*SKlsnwDF2HF*l=3bghN6LyS=ZM|h@$?E`~`QCNlzJsO#= zGyKhY4igiT1>1Vqk#^lwnpM=kD`Z94l()s8tf$xZqiYBD<|YAN%O|9+=26@tQ#0;I zHV3%`V8b0n^czWSg!HEKLh9#aK)(FfhRI|G`SXS9@*7HS`?EINO(qlim3rw)yBizp z(l(3p=1z)MnjPz{N|M!;LPons3##il_v4z{G}X^LosFYEUQNz7Er0T#^4j zWBAL9e7r;!3E%gS(L|Ws`)+05@xp%B!6rg^)gD%vFV}v@x>1X6avjA;>>%3z19DQ0 zW!&G%1(L*ijZc|d(KJiLeC}9-xCk`efr+%GCcEk9zpv%yi9nCy5|U{rc=~D`Mt)aK zXDm86mVPw=M}m9LO(Dou!A#jP?Jz%y^ea#qS=oHtZLP`j-j->9@3N=#*n`M$lAVS| z6VRTezCCg;>{ufm{R%u7!V7%*dpV#^6fY0)wi&a8NZGY(UL%r8k}%cY54Mwwo_dp2 zabyo2PPUsQbv1|IUGlXIwO`H>6w0Mj0VzunR;O*k z!!*>bidI{2up>-V#K6EA4zAW4|kQAxC=z+u^EcM6j-ec#6lw(#0vwq$XO z(UNFy?|Gw;k=LkJ1{0SKE;X}{qm;i&YpB<(Eu)R*E@uxLZ-D<(1i+3;tX_>9ZK;N| zY-7D4EJC>u8WhO==FA^;Xta+`fM}`G{$`LbgQ;Ai9t=E_tuU0QNflCFSui4jj}wRU zoChUGV#U2vhq8~aq~Nxel!f9oi8+@iCEk;+>%ET7hab(|^Ls-YYV6j~Bw|0TNS&`- zOuR*wk6?%TUN4dLpN5rvjRo!G(>wBvk@`8lzA=XsI`-aFJ035)K^8a!2fOXSA4a1i 
zB4Qrf@BBr@#2^y!f(a23GEOfmoLKy_JR6y)o;%6Og$^A$Y%n>cS(R_%fqq~$tTSuF z%D0HzmdNzIICPhR^qKCQ4u99hse8*S>+w8&I!^;QW4Uz@qFOrAfkw0-IfOeh z(sD^cl`0gQk`zNz&B79wm|;t_{%3pB|8JdN5k@_qDo#w)c%OUR-j&LGHMsoD2M-Ny zNT~YlF0y^jEnwUfjSJF)F!EOLdkrHDev(eG$ZRv~3s0$U`$V;3-%6-RX)G?@;(W9s z0kF_1k2}unE>RN6C#oGB&(k{-j3gxprIhC&;u8IztiAlVvp1UM0K+9^=2O)oeM|hB zw)6gw+;Q`lKzxl&aCrV|Ozhb$Qar`&G_p}FkKrR{^tM3(9uGt|07*U`DR8mX_ zI_k+Kii!e4lB>DvX6B=p93yXL0uu3;yogCGh{AqSYuo$pk8_;@IYb6o)~gS>1?^BvROfqZTVSteP23jTv6I z*2>7p$T6~S^)C+b3)ADj=ptzY58_QkNueT3g7ZstG<*P#p0Cu;DH+2ai4N4VQV*(9 zLa!=0KOMt@KPmcTZx(Dp7UmL@H|?z!J!Cw&n>i#|qY4JMqvxa=^{?U+!3`X^W&5kz zhYAa1Y8HoRmnJ70xi1f}w4)MeB*w7Y_V>#tH#(lM-T;JC8VUv1(FKP2LI~@nio6@_ z*|hgJ8hRAuG}Ncs4c76i{3lg19&k$c{SkHJ+Lm4*)0xk1wIK8S)dBUygkp{HuH)^Z zV1}y-H9xrsr}YwpF4GZ3IG#5no2d_@i{(rllP(YMqy{~V{kge;L9Fx9^v9K()o@m~ z72(^nQv6v1NoBE}gJflsE`Pho{qyVvsBrP^HT7s>qu~}^E{3<$`y3QTFs9oT}OIN`=qb zY_-ol(l;){JTn>Xrd1*((GDC^kG^N5*Z7Vx1+Pu`wYN@Jo&Nt`~)&I>SOJen-qeeouqy$>Bv99N3ys9+bTN4^r zY+T#{t|>vb{u>gY0=JnOV?0Yz8{M)jOtnC6L(Or0{Kzp~C)Np~D#VS6^t|NpwwXW+t&*MCf zL(SJ&frVIv_XGFg02juPsipqXj2u>?VtHr2>CiikuLVO)nO^m;5t_;dBTNE?(i1g% zOz(cNz5sj2m~pbgxp|aHHJP+9*pJCEr0h4FDQfh$4wCB?SgiJ^(GM(vH)5z-`jb8x zdJl$(2vSXSKZn$%Cw2Vl=>X-#VHp@|5$eG*#otkG z#@kRsrAp;2yy7Hp9$R$=IUa|i`7r?)tBaXd7TX8Qkqo97p}U|i{we;i(6bR9!^CLo z;;?Ni*nyDKePB%g9&zT7KVa?HUpcO++PJv|-@IE{W`d|&@f#6lx~*Ibc-)DM^#?Ka z4l2|y(hxxQn5OwDZ{M1vKoMtN$301w2?bR1Zo>CBHSA6o%In-OX7c=e_9%}rA!mnk zHO;4#oDvj;9D>WJ(iRP{J2jm#zeMJ08f(fAhNFdwOA7*00!VT%Mf2PQ_*& ztkb5V#^-watxSWbnl6SMlorjlyKr#b%A9@=*#n%453ZIn&Bl2+TAGGi5gBOaJ@wA5 z1yvZI=5rvCZCgYhC;66dv?^@{^uV&91icNZl{Wi(XGTqu&X{YQLy4#Gv8enzJo%G-kjj$zs)a z_R$LqDcQ7W-ZGvj^Zhy{^E9#OPwbEM>YsAGw#)l)&zM@xNQBLk;Mm;PlD&gx_2R_%m2AB))O9~3s%gd{q(kDgv zc&%4WdglA{lp|I2x37c~_B`;>Oulx$KK-N``~0G^AQYd_;*>t-KhXa~=pOq;63!Lh zU)b0sqDG|-HXUlo4i?8bsf3PB)ao!@ZnWOZ4DQ6TB?%*k%9w(TuY08IlbL-OB(vLt zVvi8_!aE$Obc?WM_w5SvuBJotkwF^sl1Lduk`1}>t0ym-Gd2>N4}oZc^_#&0x0kHR z**PV@>cYbCI8is|?Q~ZR>3L%z1mLLI#d%_rB)jK0LvMb_s#I`#d_yS-dBjC-*B~zQ zL%IGtsz!8DhQ-4Gq*{I^v2xv9gKXn$r>h1_o(kpHGcm)7Kfk@s440?p=XppA zOCvMA&}nRyKUU^hcqoVD1eE;7l&6nh`}vAt@TB|#WH)(P_Vm2~asv8u)!0GS&dVm} z+r?*JK!R(aHA}6mcZhsW$Z@qvl}ey z%H5oe6{R2@KSdzrJ+q^(Xj@Ex&NL^UDXWs1oZf|^91>D2ekiGFyOLiJEc=r<-O3{& z2g*^A%#ekJjWG?TqN%mu{sQ@Ln#nG7&_8R}H>qDfC3bbVw<}($44Yf=TBWh(^OVK6 zp5_+W!@@!T9%8MLlAdnP%p_-?MH-dLoz!*hqm)6(cOZ+d8DsDjwnI{a<>9(Fug32N zr^$rgH!*4rHBCSUgORv_7-(R0u-3KF!1Yi_RFPb6;w(MNHYGJF*Rbc9tKiel6@@yi zMd%8LGhD?D=dfEA|>O4rs7EcafG5n`=MtilRr zf4+Z0bNQaCM7@j-(y3Ge`RK1JNjM+Qu*#F~79ej0wH1Xgf*_35hPV_Mby!eG**VDu9_^WY6aJz-zMW@`Mix zdi^B0M46f?(7;eRWQ``@<7K$&Dy_Fl+*Y}H|5vu->ApjB{W;ILUIo1p&?l?l0vpeU z-u)TNFT-~x;c%)b8c$^j`UHy57`dW+*1)(WM(ZJ=c`p#3Q}psl5ouVv^5(a41Ei&m3?5h#c9qmChMga)Qh)#SJ;wio>pwh< zMa-{C7?TNoo4jK+0TctQV8x}w7iHb$dz}?+jA}oZ&hK@IMw3~a-99roGc$AQ?@rld z5hYy@lGV71CePddd3tGbywJPEqQ5NeG`03TF$*&F{vd&8nU#rXOpR$uK1>Vw_0#8t z#JW;iR(bsXpwgt~16ByO`EM)8>!mhJ%i`Nv?baK;WQW`GYNI7IO68%Sx*)jB3Kg~4 z3DC_VGxRXV8cIMuNgDY~jZ;8Kqw{qx4Xv=E!Z|!qd%U>q(3)vC;1Sye#<4TaTRJ&K z@}-J}&4{?2MegqB3S$uo{j2g}Pe)__RC=Rh(K{&~*Oa{=M3eP3{tCqBgT;&V8WYxw zL*e1Q&l8w6utFFLl}G60IWPp<2y;~P&fj&CZaoRhgX1nd|C_!1-y3`pia%k*wygXf zm(#ZW9sT7W&7!U{xvde`Qr=oM`9N9)8llZv6|kqQ$e&o1t)Ch^OFE3`CW&A}sI({U zOY2?)nic;Ivp*qxDrUKkdxZTBr}{4Ur{B}!&t-KRhJtMj-2%LmSkkDq^xaBHUJ{n5 zD1G}pSE&fqt8O=NIshb8icF9mW2e+2N_z|c6*X!!zut~7eMI}P{P<(We-XKYt^WDM z&_B>1ie5eU_wb{`9p0ZlZ55ACqE;`fL?v-`os6z?~l2bf{>dFKwnK2k2~qwq;6}^<&#FKV;!=xoneS)cuc*^X*0`PA5(q zQv;?6sv$Y7fPdegT~?S80&*#yIw_+bq8a{LiPnFRAZuuGki!h; z?=RxN@LV(`KlUX?RlC8?7zbNF*Y(BW3?1yJ4pFR90AoTT2k;z7ymqu5|GQfY++*H* z^qY&@aOe4cgM}h2p~7~Zm*{E8NG{1Aeaeq-b!= 
znNXYYp(|7U@#E9d-$wrwyZDb+OfuvDK#>?!c=V8lg#YhPDjf5R*!7f}%!^~l z$A!NyfBow!{#|&ukSm`m;)j{)n`s;0jkNy673 zJ+8!OsXebs@WeADA3j_M=t)^)5)k|)E22*Gm|kKX&K*BIWhNuwqaPj-UQ*4w^&cqf z(*dHQAG3m_09C?gAtEJ2NkIt@(?tpWo%Xher|}%~!`vv6LBW$C8T`2ysm>^2GZAcR z@V8gV`2JB`CmTh8J5YX(NNGyZSh{qO`}`>&P^oGosUkl|Y(DRkBI zFQKQAZau_hSyu$~tgNxXtRIjUU;)aB(`}t)RR6t_;Ze>vtOHz6i%2B`nqaGW>5>Ab-qI(I}0thP+8%S^k;;|6Gy& z{aFA14vOggSHm;Ce)7&;UIy92d$Wh22?V;P) z;APJ+6G#g4Ib@J2zGbUA!Y3hOEL0?O+l%=c89UqH4kX2urbgykO{c$^6!dD@=g9bK zjb^~QhG6DgQq4yPhF**aCG{jS zl$s_{Y(eL%eQlFXx1FQ{dlm9P3yS!JCY0^;_@OYlGpOt+5nwn;(p9&~>a`ho0BLk&+)Yz0*X1=+(=u_1#3Xzz@6UGk{O=;+a4{H4yzzb*{Yci+jVum9Jh0JrkKBek$w303r>GS;IXb#h|D?X*pP|(-9Y!7(M}gTsgouw^k~xs7XLoZrn%xfDqXubV#LFiIxLjDRj`-39~%nEy6Uk_(M!P8M^O*Oo0a z(4uX)(1%s zzFp`-9-{MZ+Bdr^ikH>izjy#^byO*0eJcOVRM9vK7W2~t7<{FGPF5rTJjPFXG#dD&^$ItE!UuIQZ#$E*heq&q$y8k%&mSoP zW(m{j<1P&<3>!T98GRO6MqM=R($!FZ{br2u@I;O2?T#FxHlp%{lJ^RE3u{PR+`P=6 z13|s+8q)ImX;>6|n$Wvr)43{8u5NG0&gz+czE)cYpaIFFp<VKO>e4eYfN#(SeYf|@{%+p~q-AB)-0D{4rE36ht!c3pdkr#JXX9D+bx^(9s zJ5@d^e{r)^yHh8>#Cny+sT zF&zFV;hd7OUb@EmHelaoLYtm7YD#PPwi$xkllMWuWB78{bf90_jdLGAz1yjn(a3El zh(PPDV6-GKU7V#?JKBWz>1`)YELV<{V@>cmeM8UozmJJ{PJWMOdgGu1*L0dQr`|Xz zp4^zNGS$E^8~#0%=ejR>x!M&|T1#f3h_p@ZUqTELq?D;r_$z|t3VEd8haeYRY$#i( zbcHm1GO%FgE<<%uju<-dK}AQ+RiqFp2u$Jlti#fN#ja4!YT|;2KcR-kc;a+o*3~p^ zk$Ez_qiM@8-9|;NKs>>^EZ?mKCo?~s`{B+fy%B~5r5yIW2^sUVEUl4t&kHA)WZ|Vd2 z7Ce1W9M@;oQ-2ak^EKVGeRqp7orWR{6dIZA9*Cox^yA$3s)ICN|NgQGygX>LkR7qH zxw-r0+#(Lk?yDQ%VLQV!ilJfz%d@HQov1m)Q^5^6?sAjM(}{=}Myxd3G!N8=E^CoD zoxOtzF;T0X?ayV;c+}J?TyF8k8++`Yy;Cnwevs5ejn#ygd0N)H0}(^worF}%>8clDVAA=Q8FS z&lJcIoSmOrv78T_=(#(OY0KYGUq-{I?}tsnrfXfQp|9!Lkh+ zUH!eBz7Pre3hNvrI|beAc&zG_7}P`e=AatyCZVqBSJirl*?g0-2V&Yku%@p|WhN8( zd3EU*xXY6gR(#iA(nBUSOPQ7A`~)j9j$M}z_tTMi!jg=9;+S>bNugqq?wqx1%+<|% zZ2d}vjI|q2h#8pWR)~0BWHlasocknoGo5tpxqVTBIT||sX8C$lGuFWpS4@D-z0GfZ(f?}MG)_3&xwZBgy1KGEMhq`p+KC(d5rP?H-gbA- z#}=vd=uvf zlx7!MM+a5*`mQfaV^+AXzb7WmB6O!(!t#*jnZRh3h%?9uD>^MQKJ!TFN4iYKlH<(wIbU&&JvV^w91 zv3R~Xq<^)sVu)=uh(p~yaQoBmSgm8K>uJcfq+F*IVr$2yz>1j)N8dWmJvOC2flXq(TIG$$6N5x?suvYm06Z2ItJHR&^0 zIuJHs*?}95aI;T15#bSL%fn*`+&p!wB18y+;Q0vY6!r!nhlh1iaE0dWrx6(-f^smJ zX5P1j?WB!n={U0Pa{Q(ABi?i5g%w|pu7=2t4NllobL4EdOP&JaQM=d5> z^Q&UZMX}eYgchIZY{(XgGreC$r6x5z9@Vt5?KZ0EUI4w<7o}fwz|FP)5tj{LvM6dmE44MQ?de@-cqQ}Sf+w-GDM4!GTXDTIrP)L<|t6^Jc=1^&UclpH;NDc^m9X2LX{m9Jn%1hUS^mDgStmBHuc*`Abl1CT z0^c=+`m{Q)K6!NhDn2`7Suer1G~E)aS8-<@#jCxEHC zr&>k_@bvX^9NrLgxgTaO>eqa3_+T~(iXUynKpOiRCIf@*N4?Ox- zV9geOSHzCAiMnVsPiRynT7Ir6syq<%dV@4?3@3xqn|f9yBYTX(Z>ikczfF$lcX!0- zlBbOQKF=TddBWyf2YU^CpEmER%RBLV{;fe$v)xddM6QI}#7aG_ypuB)z$F@i@P(+w z07``~^T&8??XvdA6B2p+HAgk-S=b*U{j4$d+O&0q7>Hb*u0S&J8hAVxJz6)7tNG)s@eq z+YMk-B>d+rKh(p*m>nn=BnYQ*j*fg0T6*N0={g;6im5}#&War!!=l?f-ZkP9yj$%- zCQuLxv!Reib6fvG_C-d&6%C7HETF=7Ozmz_2}$Gqqi`kpE2l(f;z?NJA||L+>Co^v zNs2NNxdf|x6k|M$pik}^XMaPhjlogCa2jSay-R7f-aB&qJM;b(ho!GAZIvb{c{meg z^aWCW+uCT7X4L~&@sGUrv37803DQKi98ZV5yWCZAsUViM$trQRG$EzJ)N!s(X&r51 zT*Bz&@o6zW=D4o)oXgdz49D*P;<=a75R1@GABr+-qTMe~-|YOc{Lnbv==C+=|E!6`z^=jw7(25Z=E zH}m`>%-OtkZ@6?03CVx&h-|3ZB%OESSGjP=`NrLy907 zDnU&YvK267Rul1DFjl~{$$Mh`%NTvv-=Ax7Tls5__syERjDFd@;8Cwoq!G6Xq6Ab> zBl>+&)%0p~(qF3KRY*HI`yx+g^#sY!<7v%p!KL!T{1UFnyR6lqfS(YsLp~;EV$6!i zd(+N>lpi*TUBL1Jax2Rq`Mtgw9uCL#R~67=G0nnTDhPk1-EZtLiZ1}0mem{I+{&Cu z@%Pl$?s{G3F~r>);1)XKu3PCTZghSb8W)-HjCsOXCq9dL#t%0#Qt2=~DEdq;7 zs;npfL@q->JKv&+5WmcbIoa!jm7T2@uvFZqjlHMjcyDl!O@TMWkC7<+(j^1HtMeDX zNpxTWX>lEY(z?~sDBw6L^!Yq}z1Z(WN^f8Ai#f{`s_OQZT-pUktP~hgpjF%IpRBbhZ=;B`-Z2m`*@N+qcky7`L%e~h3L%(Z8+dJ zq4e~JI`XE5`5OVb^)Ty81mit@1AOmY9Bs# 
zGuOV}wV`~5|NONx^lC~wo_42sJLhb(X0>twS7N+ot2y=>6YC>Gm`${sx_KK-+1ad) zr{|0VxMCY3c$-?-q#d{YZih*YuB!nGD?7pYL}{n+$UnhY3L$K)<&DVhM_VDO3<+5< zq|f^^TBK1J?TcH_)-yZMRT9&D&J=xlWmkPYe7zfbjYCYeFdflua(h?VJqO_6D;50& zV=Il^|=n}Lab|lSEu2BHQlOkm%X=L*#l$6{xG6 zXNt^QPd|~9$1l>T&rsaEo~;s4O^|4aVY@W@S*fxsdv0)RK~fTjHFk&nOL~tAm&|^? zB|$%XneA1jECi0-mDYYQy-J3kg;Z;83@bL!qqv4s=4%~h@Xx)Gp1Ex>r1089=kr|) z5_Gt)*z6^(U?aS^UZF@>@O6w^bQwyw@YuF_{{gW&@@*2wp*y;OGO?ld>fiUYFr_DpKC-S9^^nDvWw<% zL#YHmS4+9Mwf9T#ZMsityGbD9b??t>yDWfj%;?$2_EGw&LiG@>bK5s7MJm)$Lx1es zScokIO~)x^sMlw+S&$xpv$C4B4cI}u6GPu=IR)hnHbJ$8cnZSV zvLUqATyxrJ0ys}Cg1Th!Fvg!lD1y6pR;^-n1GJGgKTLd=z|VBoWHTmTs5t6pX2P|} zOgHx~*X=FA45Y@}c>E%HlTdrW8)%h|p8VK~WFsz3u5l%9%;|Xpie6WaH2w)Ohr3$P z@|4L%+GvoAMrn@v`*-Yya}8&&3VNAr0%41m0H5jVxYLkLKCYmtMxeIT=-N)o$1M?k zGCibH{)jdRE(U|PdMDwhA5A^)iN&lRZ!wSV^Om0zykUN=e}CbmxambMJ5Xh9T&-TN zkW~aqi`D)b^BlP>U#Ylc1y#?fvR)73-eT_VF5_);9=RuqsVR zs?Se{>n3|O2p*DcJEg7Y-C39N;53Y_;}b-s>M)V(4|kRb8VV8bCB>q%%~TDt3PGkv z1^Tq+G}Dx|S(@U+L8j>FPSqux3;tsH@!uQ>mSoCDUnyuXq?J%S!6U6;N3vOc4Z7Da zJA{kv){(HLE};vG$lvcFyM@Z`2TnYpz+_c%tKXl-c8EMX57qdNIRNt=jutfr-ON4< z^)fi6nn8IYuuUDc7H}Uop|8*e4cdG!KPouOl&E*RmSyTp4!XxFm9gT-}xlbw%Tfv z2-$e{XGgEleh7ke^A@+T|5H@q^q6}hJ(<@u~yp%2-t zZ)?<2+}UvLUc^#ULbq={3U@hJ1O($Ut)LRU(d0BEDO_=-;6@cdz>B$l(YqCkoLyS@ zqR33V{|8%c-uIACmofz4VNysL+zb(V7V{fI!qg3bY5EhD|0*mOh#s4GTerptNsJKe?Pc*FP0EG-J0m-TS zGI^**o#@5XFxkm$Z^@wUR^f^gs0zSMNKBkqnjMTdgDTqj+H8_+HRBp#-z&hNK$rGu z4MlP#rzLY6(itZ(tNF>v9`=`cMo|K>BlP!hv^ZJehV-$TwGzb4<49JatNwatOOhAC zAN^XJ79?+P9!%uZovNqU(O?5Q?`p^~6tpRLBMNPhg5|&C5D7@aex=OdV-0rnn8}cJ zk4@36%dDSa9SNrQG?IIIWqjx0DTeHrhIOnDRWzv>#P4$_ij59D6L=1On)Eis^toQ8 z?|UrPA@V%u`EW2=Mq^U?p>wIh=y@Y#EMBdn&a;%dEK=W8;g=5PlnxZUw$qg2vP&`9 zM5O!_g?E>67d*WT0II>5b_8w2=GhM>T5uK40*hGTv^~T+bYJ2N5_epnukc!92NNJB<*8**3!< z?B@<1%Y37oylwS2~c??pROb_+NXt zg$uP{zWRNi{=^WYnq7_=wUTm1R$|J8jOe!lh<#XC50nTH4e zq$RP?j|o@8%_@DR;^0cgZCm>qh(dl$nQexdvn~7M8ECHzLyzhCvN!yc4z-L{6kMSk zDgn-!#;v6+Fv5mVs>^PNLw3+q<3fgekM zQ^MIN>;9;M_Li*ih2svf-6mL4>=z z8WH9fe~bayntG+T1hD6t`C~WmHriWP4U1@EpJG7YQ3=?~>6L_z8n8&%f6`0l=H})V z>Kc!xw*8FXre7VFmH!;6Qcc0HzlK0 zw1zY_&H{Sm$qTPG-`aCvjKjgLvUIe)L*#DDbP-9+0FU`t?mjVMUf*oyXr)6Ubl8&| z@kbQsSF2QdHS4Qd@X!7~{}hEsiDn)p%wmhYZphfKx=A^d%ay}60KDhej2Zm9zltD= z@ME|xpXh{3JAr2IXaKLQz*xf(X!+1aNDht_A&14wuSZ=_YA#Jmep<^s`wbUWl7sUg zvkhg?h@M+wsuOR9?I16?>A9rf@c!-^CVqMNFL%hiGhOn0VvyuHB|O-g3mJV}J4b_x zE-sdnb>@+2qcwKQwEa(Q#dry-RwL6gE|>5sUA=Y7&b2=LhB~bAxq^tc?A>EJ^w={u7YayvnV;wrbXLN#sjcD-JWh_ zP2Md)ALMnbzU<>72|VW}Na8UdBpD?VKbR0W3`C$_p=&Q{hi6@3lk(9$)1Hy+?RMY4 zkDZ=q)q!&hnsw{y{<-10JSt#UUDswP7hAGUgfb~2(}2lD2Hk!w5*iI%Zl zt9$#_dXuT$mq#6GXupnxLpM!0Kn6>arc0yAXJHjNtrUIDO%TtVFYlkD+$Kz0dI*j8 zc2Mt9Jra?=&!&#T)U-5Dsb^lrp*;KmCq+gCZE{{|Y8{;pFYJf1G02F#h|I#0o=I>o;Ff&dW|b@#ED;ADb=SdDDGDew_2y_ekmJCudvCGAky%K_ zd>JTamzCUt}&>LUR{*9upzp=JMIpe7`C7aGv0*zDFf$ zXR*v`;3c;Hw0I(`fpOx+VU&!0Y9_>V=Qv@aLdhP|kCotSeM?-HcvNFK2Ej?vospG) zx4^()X&{_mkT=NB;hcnuc2?7mBAV+=7Cat4qAT$3A>xVHa@0xRdnlKGrEM8zm^k}%EnoWv# zT+i8vh!t6ZCrMjvsLBtj5WM)JuL|0R9k1w^#hMX&_VL0F>*#Y}$0ylEd*k+%j?G9g zuZ!G{%z(!+)M)1eK)ZwbfO^hefQACh@RlsRy=zyzE}IdM!tRP_1{Ib$h9`IPnwFU3rn@+J$ozmm412isqSn`%9eb-%@UsPYD`MnSDGKp8zIrrV zrvdT)Mx_DVb32iHC`v9b%dCdoAcu+x7w!>lFz~N_1Zc9gUZq}mK6J(?8h9mlNc&+Q zFBAj)h3GPJgv_7++d61dW9V~T7tkS!0s>OBo+ z$jB`xo$>HaOgpKW`NhqrK-eL4mGky1a~P>tZ$Ou9jU7VP3YHz=NQx7|8;|YgI(p7< z)z81nU2oY&MKe|`9MMn*UMntlJrsf6q}A7d)-NRTHb&}cBdowaf2Z#$u~$3IC*lSN zPT^p*zPY0Mkj$@kx}>z;(l@o~vt7PyH|>ZnNwAQ~ELCq7&y&D}ij!(~pT=~zXlI>_ z87&q)dI!F!S-0Qo4i&kSwPoV#EN^ zPwNM}KZ?-yNEW{)T*24w8@u%QFbA=QO>UQ_>wBz|7f`GKt~ln#Ht0#PBD&@CB9WQ# 
zdc6SNj;}u9uB%Ir+=AYDd3$$npk?XTtx9kj|8M5mJ2+7MI|6_$nwYp!- z^N=$IMI64S2P=__Sd?s4*rO}$*EX-f{JpN$PsoHs~#v z)z%_=T^c`0H=OsZ)V^yXwtScN#o|$Um{)!C9X5VhHgTOT!{ym=0vZ_CB-%ur^p2=~ z<>gngn1LV%#36Io{Py%Xdg%C-+k-jZ>J9ZHrk}-`O5e9Le8hDjcEvME=7d?`@(`1W zSU}*dIHzXgd{b-EyrZ*chN>p0b2`?Wu6|kBN&I*;az)!JEvd$0giW*L>k6OyA5l}{ zCiel=?KjaNCNl{KhaI-ag@XpE4EHtS^%B78yCRVxZiOqckeBUvcs5GIk9j<#o^#Zy z=&bm{m_4^XZ0OI{qk;NoUdwMb{D5L-Nc)`O2GV2p%Um~3<>T|?W2TB`+oOYY(#1=`V(3!BXt+;_FZ7&rZP8cN@A3mX!8W}>5NFzawyra110Uxd8;=&^6)=ccWz zYsJ_UglMl-T8VSiT7F}oo{c%zaNsppdA1iRXXGKlcXJzVpI^ba@~4XnTpI2o$AGcf z;U@PY1ukrT1BuNAyZdQ3N}|TRTxWu-_SxSfBtD=Ab*S7N_?qp+-0#ul4lF6DxLE^pWcpy>;MqW0B?HZeS}d0Fq+wH8ZUf>%GHTA zS|7_D@_ADJ%Gk{tWN$E9%#M5Vu4w-)!x#v0+QJVruVLgjo!&nKUCA52?$CQHR63*0 z5d9{Vr!2u&;=>@3#(cb#@)?;%Yz)lC=^WXQn}7i?@HyKeGhc&!67v-CU?7+8FJ%xK z2CVD?nc_B%=4E>?NHS95lx5zB*ljfxXeE#}Nz9nHKEGp1NFMs&s#)NT7`S$t+teb? z7tp^jsRW%|fugH@Ak#EM?5V(7X0FKE7d;4d-qY7=o<2GouF#}05ii$$=pZA05z@%V z$9_G;V7}P4z)%XTW)*JH>Br+}uk4OjpP7QhV>}&xo}<&gv(?o)Ue+zY<$WGizj*t_ z12yYS^4)-!IcnM;*3dP(vC;DUwTK)X$>^HOiKwfSp`&x;gE}tr@RKq-Qe5{-E!Q8xQV`Rl}W7ZgT!qesR9X=BI-_n6j-} z-+NQr4QPJgHt%6fQfS{CE<@+!4;+QYCOPcuDVK|H$1iN`HE2NJbp0#`ut1pBI$y3& z^g1ESn{#J~hm_|8F%nq)!O%CqSJwAuw`owSx+P}w@zerWd4=#4S8KDa-^MR^Gy3M$ zdh)&E^;`Fsjh}D0*u76tIK&>USuP8}Uyw@s{e*ARR8VqG0S&C7_Qy1S6#_8SI2tOC zD>otiVH5U?$+4S-BbMKrSVXJ5V!lC!8zUTybJh`$2dX;%F=PmbA3nk&>9Bo9JtgvnL$%EjL7YY&yr=6Tr*RHN{rv@ zS$O_b{AB*((p3$!cJM3KWV)ofJ4PmTgO1mm0zC7HQQa?XjB|;1&Y1U zPBXVC5I`+wibWe0``p+};jS)x{V|qIb4Xb3-31PX1L)Ej!{Rq}VgZ+^h6cz#Ki(R@ zWRI^}&7S_J=^DWsI_gP#Sr?eo*fglsY>M#1cO)$r!e^E3LO5E5${!mD;C$_m=uEJi z0AqRnfM3Ma#adc23Eog5Hq3kBQ@0?t3F6a zr%OSmK=)!fJVd+W$LSg)koh-lojcnuG~SQC*DF7dE!>`aLD^!TlWR&t$FtC2TOohH zS;|a7=YNd%r5;Nj&+BeL#7k%&=n7*|6Y;Kze1Tp;Hz(-M+AB5sp7_&268~Jb4J=|a z-EB-UsPm4HNz;n#ry9HIyyfwF6XJLtZZFHDp}g#Tp3d*VsO))0%sfyS8>NcZD+AcC z#Gi|#y`C%>c}|Z?>9wcSn}cun8Bn=rY`dFYItDr{Hbm}yWR`ulLBu(l=`SGrYs#^# zcW0z1Zxge3v8lOPH7&-{7?1chk#lLaO_uK$jAV2_gkVjsYxVcGqJJc=&mRgB8?4F`D&-ED?KK+>Zq3Y_-L7DGBIWdVe z&i!((RMga$-q6;x{mXCW=({CYJ;aqbEQ=Y&W$&#Rc9W@M9=hd4J)XXzu&mbY*T{dq z8)>EOq?s=&raV2UJVogUi*=6i`YcJ-2!&K1Zmr80#SYOoV`U|lQ88B`Tf%_ z_vIFZ()6ZYC+uXm;;vlKn<-^RkJ6fFEPTuOycr$wr;p=sr|A-YYn`N}QF!8dpFUKl zdDg0Ku*{?3)B3}2g!*ISK-<<cMK>HNGAhIG7a`=Ew~<}rC8~$> z1QWYo$LnN{ZBk;MgR)Q>cZVQ+m(?uaiwJQJOvaSKQx<2bqHbiyWo z@K5dc#1z{472gWMLX1I?`%9{I1+#@Z^5f=fxw8B66N{2+LJXB|cEkoNOtu-2i69p7 zEoersg_P~o+j=k>c22X$Q6@=Sze3i_!a z95;df*)P`>e0sH`QoVYxZ`+%s@LZ2;@^jq*GPxnA$@waqbBbj5Y3$wBSE@+JNHhlJ z5(YHvIo%3}r^GcvO+%2;ON(WsT)fu3iC@w0ye13BRLMJ;{h9RLS-+S}>uil0yt?SX zX6@8ypt$!O)$hRVlUQI`&YseebSZ1Ar!)AU=Z7>+^FM=#Tr4K8GgE13_; z8Pk~C1~a>{iN{c$@AV8CP=K%5IcoIXh-r56cnsOtKBy@c_S@`+!;(a+&|z&nEAFk{ zcb8f?_07DvV87M@D@r!eC32~>>8+8}L80@dLlzqffcoPg@#^P@@WHXz$HV2@t$8TUe!j-g7Q4+18KtGCFxGoI6ghhFqd;E0peD7H6v>;Ihm^*e*MDv8HTus6 z)&IaMZNNkSsE@Kv9iJKVNI!s5&_VhkOClDyBhg!aWjt`zCvvN?Rg(Xy)e}+Da_s2n zczm&v@w{O?A6V_;V;-?c4)qOv_dMPh+we#8r9Sj~_uj7Qi2rh}G+B6Lo1Ct>3^1Ex z5Urw^;?{jl$H@tHN{wjCukZgv z!agN?@q)T56#M&8Qp|S-HDN^J;x-$-I17^sy^z6rn7FYlrb_0^WPtgX6@uRnaKBU> z!|jZ02nYNaBGBSNMQphhQ zc3~A-TNh9tw6wU?k0Y5rXZbhhB1%JmDF8~x4<>5mfL_6gUhPnmC7nz3_YA&Z*@?mU zKIaFjTo(T``u?$m_;sl_jNCw4g_k9^{~wr}4p4$GsruR>CJ*=VuiNZjLggUFC~1G+ zGZK(lv*<8s``}<%1Mk_s-}LS;tN*-IJPcJ5PG>w{@=Eenmw_uYnbHdyqhs_G)%4KM z6k|c?9c7T-I6(2+rTlh_^G+ZhpI`j9WVrS*$yZroadka+%+3Ge1)F<-M;Wy8eYd(3 zuk|hE#xombnKOV(L&YzWzTcf6{);5+ur%_wqYR9=Kgox7O4IjTWdmib55N8r=|x`f zCxwPH^`M!c(r*6$4Np+r4GC*PbEyIk{uhJuU(fviGrRurN3L(8+MgQU%#=_+e*C!3 znv#%^g|{zN5|DxdSOoQN1V22i7UoZXiq~S{AN=3h@Ly-fe>n!IZD0H8i9tOO)k0s1 zfuRAS|3lbUKt-I^f1ksDyc743=U;8GSUNj9M@xRn^6rEne|`&UiHVLjVMi}SkzICs11gsV 
zPR=?6BMw&TIh^3P>9JjyBicaW_ z{M}x9;o}wK<;LtUmlo9j{d4!Bi`kIsNkHdIfZYcF`r&rWL!A%IlVyg?ZTJze=gJN# z{&34o1=APm3`!LJ@snF33P58G1$v;IY^{(@jw*~+R#w){<$~`0@cD09uaE``_C;e8 zLw-ka{#L;IuL{d{iXh^R)`am`OL+wIfT2P(yGSGxt?OpB?ssx~d&HP#L75Auf_UoV zw?+FhXBywqQjw2@9%=g?(6d*%?u=g673OZcPn}ncioE^JC-+M+kjuU0`{L7ogNA<- zfPYCyP}uE2&y{zT4B=->DF9v{ayYeb_-}GMozW;|6w5lAeASmJ%fSD`#{Cbh*Nqh06y+0 zAv_~|`}S?!owV z)@&d-z-{dr{8{J!H^ZwJWcMzcgpiO3Or}hJcDQ&aLi1=^W-<=D;@McldbL#eTg;_aKFU_NfaDcAcPCoq=S z5+S+YeJV_WwnGQ9@4Ek3nEWrX3B+|Ew*l${_i4nQQ9wW)FMPXLCU2ja9T2Pmsis8t zbZ-;x11$xr?2*h=Vs0gLk+HzxUK6y;DKTh#elt8w`4okOD^8_Ba`pSdAMx<011_B9 z@hSneAKyUz694i|qdd*-HoGdSUeH z=kwheOgRmQhP>Qm6DEeMq`$)l8*{GkP;&pRvVRL7xPIhz9jJcz^O*hh@5KUoL@q?O z@(PI=+;rm|6y$Rt{qvxWTyW2x=>FigxmtAW5y3%ymz)F7$0sH^d8sndM&(m#9Mjvk zcywdGe&=k}-!7n_$SUhqol%#rJ#iceGs}B!SO`_5{e1!F5lH)RwZ{v!b%%q}a@U0bP zT;kv0{v@n}I*0DATN2Fr*(>*vdPHVNtX2WmWyJ#HAOEl=^oasH_wz@3 z(CNI(L-K$BwByMw1na$%>ts^2^k}xU%2D!FEc#K%7GVDLKoCgmB8eZaaC!eOEXIHs zW_zyn2*ls;4DM|7BJAs%JO!tKz`B;_R0UsZsaJ1i#uDFTkA0UR_MTLdbIqqR21a9= zDWF(mGWHIKt(PPI`USo0zb#nQ4T2u*kfZIs!~-`P>T}lEa)HQ~%W<2I&8k!;&5q z94xkyLpLRjvzM!Ttibk|J!-!{%Ptewv+VJHoGe~vjHx>7x`4?u>9O1Z%es`_LQbHW z&!B>M`E^_7N}sN*I@aukX1!pM*MVQASRs1{h-JLzW;T_$%mtnf1_jR_l**>+yZ#eZ z>;C}$!Womg0(}lYIZu9YEqKF~T6r6|UB3Lt|u`1~09ik5h)x5VV{;NU3|Fzfc zcv1_&+V8&b@@;ukS?1)W=q23SdfGHBVT#i?JS$e)++VIA_GFy$K<*q8W)MQL{8`26 z=l9~%$NBh7R|t86CN8@W{LgIu;TJAk9LkhFqpw77lV^9?rMVj{-_-!^nrF$8bz4!g z;qk{?*c+szpW54{dU|`klCAyyQ*!e2feOlKh(Rf*+j&| z1W{fC9Xa~vWQCu|9&0QziR|~MJQV(~=4o}1hPqw1vi!Y90pr^iuYXvEiFY!s)82c* zU=!M%m_-e$F1S(la_i@sQTk;cwE^H=j2%I0#16rt@o^J@yyW>RANQv2{kP(ClrIszI2pETQFA?R8%;$Hs>#}lvKQy zQQO?{=r&W1NU`0-JCq489KW9_d@_jv8Zi^b<~g^Xr1>8&c3bVbpn?*!h+zT=oBP^2 zH{*40;N#y>P_}`+r&{+-YuajJVeUPC`lQN7t;_+D%IFsv%eubKgx|*`o_*QLFsbsf zvB}A1Y}44!J!_bv@>hN7A7nZeWk%^tku9$MIr7--Q&}ZN-%J_%#%>ncgwi6j116uM z@$mPfUOzgEn&zudlJ{`pOy97}@Plo9n0SQLzr&ak$2QqzERvbObpkzyxqMu)c|G<& z_qVGkN7(GwJ-lm}83VQIg%KNjIaD@jvJ1pW!-7aojkeno==-*CVU3o!Qp`Ro@!{fAGTpNqEwA@YWFIrA z(okO5~J2q|wvc`WOn5adg>*+qN|_t>9s&caM&n%vM8sifgPpRVQq8kd8x; zmtBjQwig9O0*oSTPTfyA16&&%1Z>N%UvF%;o&ES?CJ?-_NXvg=I?r}u7Gsb$X0j#c zIZ@?J(MWMq6b?p;*g+M%w+wchnrBL9xRTLDz=)_%XgHX3is-MM_*PUq5PYEH*QA7{ zM;bSJXquU2G&KNC(ARVI=7z1)AsTf~hq$PLLUF&ue*eAB_qlzl!?Z5?xMWgSuKxWc z0Ph( zi=vw+5)P8ieWgQEN`Yve>6$!FjYW5BF^yAw7bO|3$i4&dJ(n_3CWNIt1bdP|EmQ%G zzN+E(O!Har!+!OqhHF`wSKl);7)g16FaxyuXcWJZY1{O8!h=SZQx~2Y@H;OXy6?O) z9UqCZlZV!Xy(U1PC$=4*9qeDn%wiwp%i7Gw-lHTs5L5_rk7uf9>Q;E`F#X_-BXk#6 z<~46@Tr>z&7X_Zp6fve|&2rKq4>ZaETf<#34m>plnz<%AGv-c?Jmc}#HjWxgRFkc! 
zTNYg;TO7x?5bD(zI}ykl>5c6dH9DRmf;>-0Ur@e5RXc@rtS8Xt(VP@(+s z(*{f~3Re4Qj%b}DRa^2Krx$9wx;xB}i^fPL9ZJ8E>|A5MCG{5fWGx#iF#17HZ4?Bb z)S3A8{UW;NyyCn^OF2=p_zi>z^jU_+Y~be21muLw%?mosru7Eft zuL~e@9hv~GAYX2LFRk`rp)i`2`4wpC0RN}q}H}OC5M0p-vqYhYP+FseJoiZw{1K%^giF1P?@X8-w zI`6Uu>=)K^#xME`w7-~LbVJtI{!9k(`6U?C*rY+mCMrE6^Ttc`JNh|J<-68DzrH@; zpUH2eb$D)8<}jI<__|1^7+lF^g|WpXH%$x6Y`Ii3v2h4IvJXpQ@tFN_uf<{cW^K4J z9J5*ZJ$5~s?qG|eYzTU6K{wImdClr|4rxoPdU_J!CHH)o2Z5~%e=GgACC!|lQdyU-$d@RKr~mA9(M#CMVEym?iA zAmxedN*ZXo))iZkc?v###Cc)WiWS8=dJ1$T`SoQ~UtcZmNw*y@xJ4a+6^4n~2FIU^ zsd2aMmt^%P=-DOtq?x|L+5%D1`{z9FuC_y>Zq1tXqF<|JZ_*@>Qfx z(VLAK>$Kr7J|N@tG~Y9H7>~Om(rt~2(l)H2l~Y4QW4l!X@GIetrVavI_T^{td*Gqr z(a!3Fkz=^Wscru0$LioeZ~YBzxuE7-0CSppzXZ2&qtU_o1pi)eK*3y#8uzmP{JRJI z|J@4!PY>DP4krl2-dd>HX!4>Rtgk`OEfm1dP>dI`RKIl{>v&5 zsyXW|uIwk3^V!d8O{p!kNHaI(nBa$Q_zK1?s~uC-2Yb#eh2PvEg8esfei~&!ngq(O zYvrl-!CBIkkJB4d-SX1>uvYUs)%g1p(}E_vK?6?_mce<2Y=zYBCpM6MtRa`sYUy;5 z6=ALO)Fhs?A)j;JJls*Q`V(xu^Fpjt6n$KrCeX1XCojLzj;o*SeUc@BlH2GEM!*=RMgfTv@QmZ&kyO6!Q)Pm?9a`G94%7P3(`=?I@&ZKTWvqCC1%gK0H;3+G zg|W=!rDhCCBBq9^PkGy6iZt=Tgrn_jK_+u@hVw;03%~Om4#ZmSiOpD1Tvyvg-syq( z*u)~96x^vcybJ`DzCa0qr3)#4LJkV5kxi`%H$BZ=o?yDg>^5L#}x-m4DZRU2S+EYbtag2C#3-QBBgSszO`Hpo7AcN zHi;CEU*y<%v@7N?b$!hFT~qfS z_r0x;>E7FBZ$mOPh$4{l^yYs{&aHDseoTT4 z?Yn_*8lJr=($sVCB5ub0tm=ogpK+g)f7-)$b5D6?*@3SL*gLTvAZz<4uZw&J2kJT` zflhzJ=W(%)PD2i%D7!D)LQLJv#h(jRP+q7qVwEV3s?R!-PrdJLSVv5jwoX#lI!(HkLNRCjvK7as&5*63f~sOEk<-d;Vi4_n6J>4Y3%A*Tb8ii zh{X%SCthc3_mA!XtAl5p5K&m5uVvkmbkk|=tl5B^Q+>H z<}lyj2V36Nv!_K9_B9SCig2g4=etYM+|AQOqnkgIoxvwn$JwMZ_G{7R5Xa@BBSKbz zW1H8FEc}k&Rf$PSwN?gc_xFa{Y$lH+Phl9>L!(ljsZB&}HI6$(V`ZRn$^SsN#E4VW zs3KLqUZVrQ_57rC+N_AvgtE!0t-@mEC1f#oYLodN!6Fu?Qz@Z@fv>RV{Y_$0Jh^yL1}!rOFDK^! z5x3zygy1WU+2hSQ5l;s+Nx5-?pb`|-NX*56$b^UvnAU%rP!god!>H)y!>1QL9V_<) zt-0U74~2Ryr-k|)ZHvzovoZ0bPY$i0RZ&d}jQfvt#(RMiO^~XQbf|-qeEMafHhcJ> zQM;<#JO7vRyH<#``ULQ97~ ziVPXxZ5$_BvRNUCcrR=d^7ip*+E#;C{cHvMEs+ZQVmRBp-%H>6qPJha1$<1`mx>zG zn<;seW4WH30Bc-y)N@pxOk3?ZQqT7dBm~zvQ#jyc38O_ZCp~QXB14v+mi0Cd(lIt| zT>-aBhs~f{5o{(G4u)PeQw!~nX`d4eAw9z6u{xu1N$V>(pL{rew5@CyGB;iWguor0 z+-h9hzn}HG$W`a8ttITV1QG=hF87O1;gum@=|<~^oMb+Qm~Vg zx_}O6r!+Ou^1Rfg>VlV`4Mzpx+td9fl1x8cLYis8hcfZ(%YAHfyU9KK{U$G$NNnd@ zuQ`1)MF)Uf>b+~fqPe^B@3aV7=czp%8pfC1dI@@V=>`-<&>THY`!EtbIOGi7 zUS!xZ%6uCK!5mV02@O3#O8$oO&cOz%1cM1n>ZD8)tzEhyW9Vj%&43AbnYWatz(C`w zNFjESC>Ix0vQ@&gn8G}Qh%(mqGoVEPmOEgExf@8MLAcA1 zsR_u@>{``c+lf~^vdzDf{Yc{bm%W2QQTo%6x3wK(UTdTJeutYAa}bQbW6C>Ve~+4c zL46UI-Ad*Wmv8z*`w4H- z>PJsUs<8WxJT*lpfnr{!b0;G1^O&F?u%f%*YRiGyR9ReE_jJF9a7P}1e0cwmY4Y)@ z00~}?`y;TyXok&5M8XiYz`ga#nat~i>ERXrhaa1`&^UeZ<>L^jpA5IyMUBLXO$tyo zl;=Z8QE2`epZb-qso+IbDye%%p^-#Hr%1@zK)QX zbwLk|pM>k4og-l6JiSWutHhwT$f@Z7Kh@t&5I&8Zg_^a|FduEXqF3rS^h}|B=M8m9 zxq>DalnKHOZk=`@%grX3AoS_SMix5@>=)U_t&e^lDATgZyB1tJf!#~N83KNL4J)P+ zMM&RcbmYc+s6>e>Epj906RbXP6+?eEb@l=9iD)jeLg|{jYiS@12wK0NwUzRow5rek z6ukr9V-F5AZILS_qzBc$X(^egXgG9~6nkVwpvI|jVBwki>)ZOxV$Fg62CDo=pag3> z#VX#>F(nUfmaHmbfLtJyl}~JT7WyU`G){8epO1<6IOAb32^-UZ8WmL%xQCtmvjz?B z!K?mX^>k3Y>)?h4wJn!6d7Yv}YigKNaF%{Aue1s{Ndj>q!r@l=R9$ju#X6W9`xJj{ zkjgIPcAhp@ki||lFGEh)=%*#6yMthU0q05WiGc)hpk}-T)fAp~)ZiPjuBfWW?%HK# z>-AVnOgTDQM+JHa^GQ<@@OBG0qO!qqMo|R_a&oTt_wav6KA-W)3MCpur_F@^O=+{J zF1+y9_4vM>#=!^#edM{-*RmM}-T_w##S0beVFeV0f$fOU>ec@1{Z@5}i?>nt-E7$u zRaD1=j1338dX{_o=LBgiKPZ(JRRrR`#ec;9NJs@60L8vO-JBAF9rQnj=33#hvxzHT z*ZiUqHEAhG=U5rkE<=8gNI>;zUaHwx6tt7%9sAfBRbAz|cH|vviYAq(87ViWugoJn zY5Gx!+*h4o>G*?~s3_K^`^7Q3?Ub)bWax!-6EbN3DdIOvsC^n(00|aIOuR$C5$|>( z6IpdlVESG<#bSK6L)KQQKZF%~M1yaNm4&I2EGEq<9N_Cq>Pw1I!^C~BB8x{<)VD%t_n?FC!yJAc|mm;R> 
z51?YFffCdgsvq~JFT?Itgt5p#8(rPJv$MXfEo&%#$>m~p%eG@b)JSr${c#Cm;{%D^ z>THn)hILL8VRwt&jJ!6dUDpE?od;3{=eOE`zAK(nsc?2K2@{~wd#VT3w1$G4>Q#eP zP&)u6T~;W4tLes6=@e*4tQ@#&Y}q@sw;Qhr5;&_q)D4H8&XoT4ihr<> zMa>WHR&a9)D?Bj2r5F~;Neb^1e+UdSm&*()he(h6vxF%swc~Rz7HN+JpAhaD@p6hA zKN)%A1O<4R%TD^@xNJn0!|dadk_yqS7!&%BP{Rac5ycDtGh0ve5xv0sHvtW-Rhf-y7*CHzP~RhS{Sn z!+g`3(ZJ#b-l|B-JhaX}r?Q3WYW~seW91C0e}n5?_=|nxOJdU#`7RI8nRmPYWn?dx@luemcoBNtH6dbR`{Q=6H|M{%SBQb zsxd3?8}Dy7ISF#xt|W}{yf#}C}4qs!ANo~nlk1U6drp&KuP zCYWZ)(?f=9#CZVIUg**E6g~$qf2jbRpCSw})6nU^!}Gty902MkpbKmGw((GrTK%x*6$`u^sPg>&U^5r+dz8(h?1dDl@|WCvJw6%qg)Tr(CQC1 zE!4og_faY}5I4jg^zXr(mmdMqJ0-`HgXirY6YHj^L({U>XF!G^jL$X(H(!_gerIR6Qai)0AE7*|A!Suc!P2k_Kgyu?`HOMZLER zLh0)D4pI0Vjv4NgCG;L>2o6Ag z4obpI35bZawY6gpw^7OS=J27q=;&yT1y>SazkX5(Engk!9Z*1DE<_lMlvK@kaV_6B ze}m&VtpyoiMEQIza=?Nyx8KjQ+IsII`Fm4^algEV6$MP|7f~Ukb%G7pr?rfEU$^92 z%5q3}zCL6*&q*biXM5nO^l|UwMAutN61{Shmhl=}a>3@lO1lKFza80(-J8>k7I24| zq6YVs^}Qh$9g`crAAvE&#$2Jw%EcY9x>Q%Y>@HcnGL8&2p0zX+JTGnN8cQ~-O)<0* z&5Ta4Nm~DilUm{2O{e8(|ugS^^fpmBXMQU`-2)6A^liBSr+br zNxL3hZk~=G_`W!N+o&T&uZOjO6#R#b1>L?`V%(%kE9#-1>g!qKHGsICW?Z$ts8h_b zqpsYenk4{?MExqRTi%>@Ajt^cRaeliorFG}HhZIC7n(E<1)Ob2b#;KeV$W*IVDyB9^LVvDGG zpA6c!8T@(w@25N?2WMldJmFj{_;;Mx#UJ;^J34wj1KG%M*uOI`oQ`b{6_8gtnKuxV@E8vlmz9&V^hdrM{@>j{mpeFz z?l&+#GHK=&AG0`Be^2`xhm;gwIf38r)1au|4i2q*cW*u98!_+a0kWn+|9IijWuJ-ew5xseBo3<|<#jHZ#0&Z(J z-_|c}JUb^Oal{1(#uQ~Hp3!iu@e?|Jcn$0%7w)EY-+w84)dH#PU)4qxW5m(+I7x@} zEYLlt@PXrgo(=O`nljfU9Zk}6byLspXif}m&9^Wr847~g7{|25+&+r;l)@5x^S0(S z=j&87{&v8DB~?_&utJj9p<1ig`E5!*ub7tHRid%5m-qN2S*} zqe`6f{PSwYwM|nuIaq3KNFoeg;{^FZ%z^#O$E{ugr6<3TGxC-lOlKAUE<&YyqZ{w++s(Nd^k zy4}Qu+yR^c39&N3XT+D^(yVSx_cdB5m(ns{xEHAk_%&a*w)2(H`!?;*_lA-RV{PaB z!)^QUwd~&<@Ra|_ND3Ogy*hiYojPZo5r0u5#nz)>aIvA7D!R)%laa@HH7JPMYv6W422$L8*m@F?M> zxnM-7P3W}7`|O%B*%-cD%Tx8pwKz0Ad-Z%C1d-{Y6_bl*v06R9uw4Bl2-0p#NGsCxSJX`bVsQV68h2sUSTntJBMjGOwK(&raR9UQvy;M&C| zT9wx=Vx!F^hJ`g>T4W8xK2Iug3K_0lvEr@^*bi!%@m_0hZ3!!KlF;Jq*pkx@=QXW6 z8q#1HtMTOVO)rw2@L0LK0wV-N6PBnQ0QKB?OK5k*wR4S3{5M3fhT;gE+EQR5V~qQu z-Hn}xkNQfi6TOqEfCd=@FR$hH*y@J~CS?(TZAS)8u@6E)(#GoJd@nZ;KO^sGRsRJu z{{iqq+c-+g7X?7`w~gjo3B_`yh;_+TFt_FCj>;POvBQeUY4R1qJsp zTOp_f7gnxr?5j^c)Mm@NH;ZqZ2dDAJ!eCkBv9b3(zbpG48q~$^!-T0{0vfWn<;v9& zw~HHVmllOO7B7Z3+gNHGk0;Wj8|;m*7q{-Ys})Z>)Nc7UFX@|25rJDCjGL4T!=weW zwzT`wO;u!UdK2X7_-ia+ZI}z2mu9T#i2UOz(JTL>kF)L{L>WeYx(PN{jS^r48 zDSrCzi7_d$CGG8I@U(L#R-tL-ILp@awBj7*d{&-mf1%jY>BhQgVMQ&?nAzqWEO>Qx zlZln}+kMel6NU1m7TU>MQWowL&t@M8TCaXy zwGp%|8=x34dDxFWyKX^JZ_LyJKJ z$!PPp?M(rk+DqY4hytS8zKLS!+yn`o-`sxOe)ep*_`RlU4k05BMZ zNmS@@1eoOTb3k3Y4d5F&lUt&;dtC?XGsSJfEYhHS6*tJ5;IY4KqN%cgsqZBqDm|i#oHE(S9i-QxJ-siLV!pA*7r0+4GZSqe<6>Mg&a* z@)j1zQYWo5CUrj45nEtq_JUL(1_1_TLt&1KaPR z=wl}jY$RxQCly)8`rv^HL>Y;MJ4_C48HPhtfIe&hg1#mHB_lKGU-RW4jO4+kVBZH_ zy$EEm%FCy#KHRlu4HvFC7DFEW+1M>$gRf)a<=p4cB%yj98l4r%4P#J zKlo)1m1PW3fF9c zWxT{E!D_-SkpWNIfL`GjnJe~72-8~UAywign-9f%9aEoBf{Yjv2{qt^a4fe=ID5|f z0lY-~zh-ZN=^}0OT28`>G?c_u(Fxg(&Q_u|1HUx%5;M#b1CG6_yqxFr%EX6U+WOC& z0v;5;3@NIFAQ~Ktmj_{GUyolM(7UxBqUU~wzze4;Dn-1+J0VK$gZ*o}i5ciM$e?bi z$ib}GLz=1bCsW(m5QQ;r#b!*P6XLFX6Q_0pAy zO3zOQ-kVvS?qK0?nZ3T-|9J-=L=H3p-g~Fk|8pRew!83+UTN7^X&(?z@a;Z+Xt|Jo@lieZQ)D{^Z(WgHP^3LQ@ zEh|lvqgJ@)6yF+;8FshI%jjwWw86D5gZ-OL`T#ze0CI+feD$ zA|e6l!}7z1r{rvH{+CW+O+$V~$*a znuJ-Z!PFhB>JG0#;$GPpcg_)@ZL}Gg)iok6TlFeoM z`3#X3hEP_i}-(M;j%|7L~8=Wmh`bpj) zcMQ(enyuUVMEkJ|Uq$e|ZgQ)ddp=pJ^xUe8L}jeDd+pR`@jm6fy6ynb(F^2RRZPnr zc!2BaHhQI;iaC5TpvmKC(9v$LvBmIkdxojWGKufU>pW5~3G%-iQujrJRF=bsy{Oz# zjNg&sDH}z_s`8KL_ZLxm>7MByfkL!ix`?pBTGm@j3H!%W?4oY%n~a`V-YEdjexWquV3a6l*4 
zRn%fW%jWkj(zS}N`tH?K7MYrXa5W1E$vTT(jQ1$TE^fQeU;>DnIDJV-i;c+0+-bpD zo}S1!rFQC7`c|@j9l~kBlzV+J@M@j$dHPpz5mJ))-3St3yh7{uiCCmEH2|qMwpuf7 zd9tH9k<&G63n~Pl9PWQCClpjm7|*F!<>PyqMnJaxZTe%4>m0hRTsMuG9V%rb-O%v< zD9u>PdVK&0w+>E_QPnq|tar<+f`d@@y9p*O4!L3@mGyA3Qcz+V;*%?+CzQ@xQ#p=} zCDZrd6|H%tQg!!oGt6UbSAY|HML5IR|5nJVtz zO$08{_tTV*py6!rFz@=<&|+^xU5g;bQ*dkUYt2IRhiz?SlVrN8@~aw>=F}d$t93b0 zBVRv1CJoGl^GMogP8VePBni0LCIYzv+EYtYdQ%vGFhm8_>T9v*L$3qF&iY%yV+?`r z)0|yuSL)qXE<+nYmzVuxl;ST1oC}Y&1&}d-d&zO>0Aq}UzhxsIru!t;uOyDuN%>C@-~HXk=o6RN0Af6a=KMbmGoqUNy}GhDmv(!&J;@z(qXsV zNBr&ebD3^d&nWbj zFw|OZ8K_+uHE^{-$0dmS>?$Kdi#z%w&xK(Ui-ORPV!uQW#te(bus>tL2M6zrNL3ND z1QS+vLRk3qgTp2p+{;ynSNtpL(;Nn#%B|`8pNQVYIG6!jN`gM4apGIInQ_krx{a9x zuG1LDi5tjiuh3lx^4L~TMX+wJ5vef@_z)_jKH3iLqDyeMDz&3wdJUfeJWBLecj0$v zM3EtjGUgdfEh;7-6FZRG9Cb(TLf4D47b&WoPb+m29h_IkGWm|ev-#}Wb>Zo(1yy#V zO0MQI+%1?%N1XNWbO9*DcEItt6ilPiVcY+{&JeTx2l6ZBt&X|PXcMt3inQ<~O z$En6rL)+~@k}>!A(_McU5m-a9D`CxkKHaR+aBI{8lKd=^&QYn)qoM0|tn?Vb0Y>8D zhlP9iR1VAd3@h*);FC6Q_mp(>szb2m+Bg*I^WGLZ?c;Y4PMqo5GW=8y8~=&k&lV?x zU`WKUA_2enYah|=>bF#a>OlCBl1S3As$iQSvs`eXxX+Ft0Z0zZZ? zqC~pCVQW6RX3BRGjLK8LeCk{^si1$R0eg%yHLS7@lTCi?s;r`oeGO`9My#2_<3&SN zlf`=L?V_5tQXNq${$j34$S(I+JyFL`c0D#%D>dN#8X?F_MjG>R!wnhh;_I`0iJXo- z4zvK9SfpQC+!QJVsg7dTX^HKVr2hsNFoindBA6|g76sBldAZ{`z)@lqq|oOlnZc|T0)JicIfeUH&D7)rr@sBYy|GpK z9v(M?2pZu#DbY*Ci+(`E)w&&5M4uDbU?E1QppWC=^Vxy7h*ZD`DY$x>HN7WJp#Um^ z%5-aYM|d#SVYRif-k));Lz3oYjV>u`gLwYX&z)jQLD?U@cvv{<+Px(|1<#vTml9yV z{r?hV&Bx%)nqxEaZJ6D>A>A8K*nRuV?0}kJXC=d+=iG}aSk+=Buw>6sA+cp=2T={P zMz{B?Pv=FIqy+Z+g6V+8mHGu&XY(j9u<$T6Rq(E+@{~b0IJXH@G_gMP?Z*Iu)(d4c zl1>l(`MZv_V|b&crt)Vm)?qdrn`4P&O;dG#P=D(+zY%De{b*M7=i|x%#(Q3{##4o2W|n z$pz*I#+NsCH)C=`$>2G{Lvx$?D!sdS9wKtMn6X*H|NF`Uqa+y3wW81El(B%B zWW(u~7k;r542Fcbcpld4#k#TV&qCq*l~!{Yp9{*zi7D`Xm#fZ4k+~b~n>$;v^}GVn zGKaX1ZKd3=D40Tq&H5WL!|MNO<;$`gUQ)5QC8{Ox1n|Z3hAQs z5WCsBN)WGB06;L=HG2Ewk3<{Hh&y3YK?@b8JA&~%~_+l>j~JngPk-|$8Mo&!hOO>zBK@$>hY zvXJRlL2%Ru=}#N&aH?B(u6`?+K`ceSUcWme#Aen+tZ{}96vscGmcVbNN?qA!Jwbl! z7MIpSpHlZ+ms5!_o}Dte0E`2F_^Y#@Cp+yrMgli4d|g?B*(z#jQ}g>wPUo&n82s@2 z8n02-bFHwjIfTg1%xN3<;d{44#RWQQlRk+2`!)6d z>N-!3EAE@MAI$2dY$Z{OM@Pp@e4CA2>HI@=Xse7yoTBwEP472-t`tT$s^&YT=tZ8V z6-s0n6r=I|56&5*mQIODCxV$}_|wdf%r%e=E(@R&U$t)|0EVAd{}F>@Fex}qgFgsT z2RGJ9JMP>Yh+d)&YWP7~{Q3Gd82CF|xT@|NhH8C0>Jz%mS6`2|o&Y%ItS=_@OgDMu zVWklsG}ltp?(}HaN30HbvhL$s1Tx(LtpA<6x_I*XCt6dg^X$`}m>NKEOfN^>y;b}! zr#I6DDdaGRlx+`3g**fHiy`|rpVutczTN-XJO1 1 ) self.tp_group = get_tp_group() + # Currently, there is a bug with mulit-node tensor parallelsim + padded cuda graph, + # so we disable padding in cuda graph. + if not all(in_the_same_node_as(self.tp_group.cpu_group, source_rank=0)): + self.server_args.disable_cuda_graph_padding = True + logger.info( + "Setting disable_cuda_graph_padding to True because of multi-node tensor parallelism." + ) + + # Check memory for tensor parallelism if self.tp_size > 1: - total_local_gpu_memory = get_available_gpu_memory(self.gpu_id) - if total_local_gpu_memory < total_gpu_memory * 0.9: + local_gpu_memory = get_available_gpu_memory(self.gpu_id) + if min_per_gpu_memory < local_gpu_memory * 0.9: raise ValueError( "The memory capacity is unbalanced. Some GPUs may be occupied by other processes." 
                 )
 
-        # Load the model and create memory pool
-        self.load_model()
-        self.init_memory_pool(
-            total_gpu_memory,
-            server_args.max_num_reqs,
-            server_args.max_total_tokens,
-        )
-        self.init_cublas()
-        self.init_flashinfer()
-
-        if self.is_generation:
-            # FIXME Currently, cuda graph only capture decode steps, which only exists in causal models
-            # Capture cuda graphs
-            self.init_cuda_graphs()
+        return min_per_gpu_memory
 
     def load_model(self):
         logger.info(
@@ -150,7 +157,7 @@ def load_model(self):
         )
         if torch.cuda.get_device_capability()[0] < 8:
             logger.info(
-                "Compute capability below sm80 use float16 due to lack of bfloat16 support."
+                "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
             )
             self.server_args.dtype = "float16"
 
@@ -168,8 +175,9 @@ def load_model(self):
             skip_tokenizer_init=True,
         )
 
+        # A temporary hack to fix the num_heads for meta-llama/Meta-Llama-3.1-405B-FP8 checkpoints
+        # Drop this after Sept, 2024.
         if is_llama3_405b_fp8_head_16(self.model_config) and self.tp_size <= 8:
-            # A temporary hack to fix the num_heads for meta-llama/Meta-Llama-3.1-405B-FP8 checkpoints
            self.model_config.hf_config.num_key_value_heads = 8
             self.vllm_model_config.hf_config.num_key_value_heads = 8
             monkey_patch_vllm_qvk_linear_loader()
@@ -191,8 +199,8 @@ def load_model(self):
             cache_config=None,
         )
         self.sliding_window_size = (
-            self.model.get_window_size()
-            if hasattr(self.model, "get_window_size")
+            self.model.get_attention_sliding_window_size()
+            if hasattr(self.model, "get_attention_sliding_window_size")
             else None
         )
         self.is_generation = is_generation_model(
@@ -206,7 +214,8 @@ def load_model(self):
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
         )
 
-    def update_weights(self, model_path, load_format):
+    def update_weights(self, model_path: str, load_format: str):
+        """Update weights in-place."""
         from vllm.model_executor.model_loader.loader import (
             DefaultModelLoader,
             device_loading_context,
@@ -222,6 +231,7 @@ def update_weights(self, model_path, load_format):
         target_device = torch.device(self.device_config.device)
 
         try:
+            # TODO: Use a better method to check this
             vllm_model_config = VllmModelConfig(
                 model=model_path,
                 quantization=self.server_args.quantization,
@@ -291,7 +301,7 @@ def model_load_weights(model, iter):
         logger.info(f"[gpu={self.gpu_id}] Update weights end.")
         return True, "Succeeded to update model weights"
 
-    def profile_max_num_token(self, total_gpu_memory):
+    def profile_max_num_token(self, total_gpu_memory: int):
         available_gpu_memory = get_available_gpu_memory(
             self.gpu_id, distributed=self.tp_size > 1
         )
@@ -319,7 +329,10 @@ def profile_max_num_token(self, total_gpu_memory):
         return max_num_token
 
     def init_memory_pool(
-        self, total_gpu_memory, max_num_reqs=None, max_total_tokens=None
+        self,
+        total_gpu_memory: int,
+        max_num_reqs: int = None,
+        max_total_tokens: int = None,
     ):
         self.max_total_num_tokens = self.profile_max_num_token(total_gpu_memory)
         if max_total_tokens is not None:
@@ -388,6 +401,7 @@ def init_cublas(self):
         return c
 
     def init_flashinfer(self):
+        """Init flashinfer attention kernel wrappers."""
         if self.server_args.disable_flashinfer:
             assert (
                 self.sliding_window_size is None
@@ -448,6 +462,11 @@ def init_flashinfer(self):
             )
 
     def init_cuda_graphs(self):
+        """Capture cuda graphs."""
+        if not self.is_generation:
+            # TODO: Currently, cuda graph only captures decode steps, which only exists for generation models
+            return
+
         from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
 
         if self.server_args.disable_cuda_graph or self.server_args.disable_flashinfer:
@@ -457,7 +476,12 @@ def init_cuda_graphs(self):
         logger.info(
             f"[gpu={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes."
         )
-        batch_size_list = [1, 2, 4] + [i * 8 for i in range(1, 17)]
+
+        if self.server_args.disable_cuda_graph_padding:
+            batch_size_list = list(range(1, 32)) + [64, 128]
+        else:
+            batch_size_list = [1, 2, 4] + [i * 8 for i in range(1, 21)]
+
         self.cuda_graph_runner = CudaGraphRunner(
             self,
             max_batch_size_to_capture=max(batch_size_list),
diff --git a/python/sglang/srt/models/gemma2.py b/python/sglang/srt/models/gemma2.py
index 37d926c34f..c6dbc7e556 100644
--- a/python/sglang/srt/models/gemma2.py
+++ b/python/sglang/srt/models/gemma2.py
@@ -46,7 +46,7 @@
 
 # Aligned with HF's implementation, using sliding window inclusive with the last token
 # SGLang assumes exclusive
-def get_window_size(config):
+def get_attention_sliding_window_size(config):
     return config.sliding_window - 1
 
 
@@ -213,7 +213,11 @@ def __init__(
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_idx,
-            sliding_window_size=get_window_size(config) if use_sliding_window else None,
+            sliding_window_size=(
+                get_attention_sliding_window_size(config)
+                if use_sliding_window
+                else None
+            ),
             logit_cap=self.config.attn_logit_softcapping,
         )
 
@@ -406,8 +410,8 @@ def forward(
             input_ids, hidden_states, self.model.embed_tokens.weight, input_metadata
         )
 
-    def get_window_size(self):
-        return get_window_size(self.config)
+    def get_attention_sliding_window_size(self):
+        return get_attention_sliding_window_size(self.config)
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py
index 75b086fd6a..4a0a08bf88 100644
--- a/python/sglang/srt/models/grok.py
+++ b/python/sglang/srt/models/grok.py
@@ -295,12 +295,14 @@ def __init__(
         self.config = config
         self.quant_config = quant_config
         self.model = Grok1Model(config, quant_config=quant_config)
-        # self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
-        self.lm_head = ReplicatedLinear(config.hidden_size, config.vocab_size)
-        self.logits_processor = LogitsProcessor(config, skip_all_gather=True)
+        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.logits_processor = LogitsProcessor(config)
 
         # Monkey patch _prepare_weights to load pre-sharded weights
         setattr(DefaultModelLoader, "_prepare_weights", _prepare_presharded_weights)
+
+        self.use_presharded_weights = True
+
         warnings.filterwarnings("ignore", category=FutureWarning)
 
     def forward(
@@ -356,6 +358,13 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     continue
                 name = name.replace(weight_name, param_name)
 
+                if self.use_presharded_weights:
+                    extra_kwargs = {
+                        "use_presharded_weights": self.use_presharded_weights
+                    }
+                else:
+                    extra_kwargs = {}
+
                 param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(
@@ -364,7 +373,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     weight_name,
                     shard_id=shard_id,
                     expert_id=expert_id,
-                    pre_sharded=get_tensor_model_parallel_world_size() > 1,
+                    **extra_kwargs,
                 )
                 break
             else:
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 33451d645e..870169c6d5 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -81,13 +81,12 @@ class ServerArgs:
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
     disable_disk_cache: bool = False
+    disable_custom_all_reduce: bool = False
     enable_mixed_chunk: bool = False
     enable_torch_compile: bool = False
     enable_p2p_check: bool = False
     enable_mla: bool = False
-    attention_reduce_in_fp32: bool = False
-    efficient_weight_load: bool = False
-    disable_custom_all_reduce: bool = False
+    triton_attention_reduce_in_fp32: bool = False
 
     # Distributed args
     nccl_init_addr: Optional[str] = None
@@ -404,6 +403,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
             action="store_true",
             help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
         )
+        parser.add_argument(
+            "--disable-custom-all-reduce",
+            action="store_true",
+            default=False,
+            help="Disable the custom all-reduce kernel and fall back to NCCL.",
+        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
@@ -425,7 +430,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
             help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
         )
         parser.add_argument(
-            "--attention-reduce-in-fp32",
+            "--triton-attention-reduce-in-fp32",
            action="store_true",
             help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
             "This only affects Triton attention kernels.",
@@ -435,12 +440,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
             action="store_true",
             help="Turn on memory efficient weight loading with quantization (quantize per layer during loading).",
         )
-        parser.add_argument(
-            "--disable-custom-all-reduce",
-            action="store_true",
-            default=False,
-            help="Disable the custom all-reduce kernel and fall back to NCCL.",
-        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
index a15ea16307..93c54782a0 100644
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -347,7 +347,7 @@ def suppress_other_loggers():
         logging.WARN
     )
     logging.getLogger("vllm.selector").setLevel(logging.WARN)
-    logging.getLogger("vllm.utils").setLevel(logging.WARN)
+    logging.getLogger("vllm.utils").setLevel(logging.ERROR)
 
 
 def assert_pkg_version(pkg: str, min_version: str, message: str):
@@ -451,10 +451,6 @@ def load_model(
             quant_method = getattr(module, "quant_method", None)
             if quant_method is not None:
                 quant_method.process_weights_after_loading(module)
-            # FIXME: Remove this after Mixtral is updated
-            # to use quant_method.
-            if hasattr(module, "process_weights_after_loading"):
-                module.process_weights_after_loading()
 
     # NOTE(woosuk): For accurate performance evaluation, we assign
     # random values to the weights.
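For context, the two options touched in the `server_args.py` diff above surface directly on the server command line. A minimal, hypothetical invocation might look like the sketch below; the model path is a placeholder, and combining both flags in one command is only for illustration rather than something this patch requires.

```bash
# Hypothetical example: launch the server with the renamed Triton-attention flag
# and the relocated custom all-reduce switch. <your-model-path> is a placeholder.
python3 -m sglang.launch_server \
    --model-path <your-model-path> \
    --disable-custom-all-reduce \
    --triton-attention-reduce-in-fp32
```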
diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py
index 9386d7f7af..e519c92829 100644
--- a/python/sglang/test/runners.py
+++ b/python/sglang/test/runners.py
@@ -24,7 +24,6 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from sglang.srt.server import Runtime
-from sglang.srt.utils import is_generation_model
 
 DEFAULT_PROMPTS = [
     # the output of gemma-2-2b from SRT is unstable on the commented prompt
@@ -63,8 +62,8 @@ class HFRunner:
     def __init__(
         self,
         model_path,
-        torch_dtype=torch.float16,
-        is_generation_model=None,
+        torch_dtype,
+        is_generation_model,
     ):
         self.in_queue = multiprocessing.Queue()
         self.out_queue = multiprocessing.Queue()
@@ -90,11 +89,8 @@ def start_model_process(
             trust_remote_code=True,
         )
 
-        self.is_generation_model = (
-            is_generation_model(model_path)
-            if is_generation_model is None
-            else is_generation_model
-        )
+        self.is_generation_model = is_generation_model
+
         if self.is_generation_model:
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_path,
@@ -176,16 +172,12 @@ class SRTRunner:
     def __init__(
         self,
         model_path,
+        torch_dtype,
+        is_generation_model,
         tp_size=1,
-        torch_dtype=torch.float16,
-        is_generation_model=None,
         port=5157,
     ):
-        self.is_generation_model = (
-            is_generation_model(model_path)
-            if is_generation_model is None
-            else is_generation_model
-        )
+        self.is_generation_model = is_generation_model
         self.runtime = Runtime(
             model_path=model_path,
             tp_size=tp_size,
diff --git a/scripts/convert_yi_vl.py b/scripts/deprecated/convert_yi_vl.py
similarity index 100%
rename from scripts/convert_yi_vl.py
rename to scripts/deprecated/convert_yi_vl.py
diff --git a/scripts/convert_yi_vl.sh b/scripts/deprecated/convert_yi_vl.sh
similarity index 100%
rename from scripts/convert_yi_vl.sh
rename to scripts/deprecated/convert_yi_vl.sh
diff --git a/test/srt/models/test_embedding_models.py b/test/srt/models/test_embedding_models.py
index 67e47d90d3..44fed2ad0b 100644
--- a/test/srt/models/test_embedding_models.py
+++ b/test/srt/models/test_embedding_models.py
@@ -59,7 +59,7 @@ def assert_close_prefill_logits(
                 tolerance = 1e-2
                 assert torch.all(
                     abs(similarities - 1) < tolerance
-                ), f"embeddings not all close"
+                ), "embeddings are not all close"
 
     def test_prefill_logits(self):
         for model, tp_size in MODELS:
diff --git a/test/srt/models/test_generation_models.py b/test/srt/models/test_generation_models.py
index bb56ebdad7..ba64907eae 100644
--- a/test/srt/models/test_generation_models.py
+++ b/test/srt/models/test_generation_models.py
@@ -59,7 +59,7 @@ def assert_close_prefill_logits_and_output_strs(
                 tolerance = 3e-2
                 assert torch.all(
                     abs(hf_logprobs - srt_logprobs) < tolerance
-                ), f"prefill logprobs not all close"
+                ), "prefill logprobs are not all close"
 
                 print(hf_outputs.output_strs)
                 print(srt_outputs.output_strs)
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 4d3f7de30a..8a887912a0 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -14,7 +14,7 @@
         "test_torch_compile.py",
         "test_triton_attn_backend.py",
         "test_vision_openai_server.py",
-        "test_large_max_new_tokens.py",
+        "test_update_weights.py",
         "models/test_generation_models.py",
         "models/test_embedding_models.py",
         "sampling/penaltylib",
diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py
index 3e858dfa72..0a477a92ae 100644
--- a/test/srt/test_vision_openai_server.py
+++ b/test/srt/test_vision_openai_server.py
@@ -2,8 +2,6 @@
 import io
 import json
 import os
-import sys
-import time
 import unittest
 
 import numpy as np
@@ -12,12 +10,10 @@
 from decord import VideoReader, cpu
 from PIL import Image
 
-from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server
 
 
-# python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --tokenizer-path lmms-lab/llavanext-qwen-siglip-tokenizer --port=30000 --host=127.0.0.1 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384
 class TestOpenAIVisionServer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -32,11 +28,9 @@ def setUpClass(cls):
             other_args=[
                 "--chat-template",
                 "chatml-llava",
-                "--tokenizer-path",
-                "lmms-lab/llavanext-qwen-siglip-tokenizer",
                 "--chunked-prefill-size",
                 "16384",
-                "--log-requests",
+                # "--log-requests",
             ],
         )
         cls.base_url += "/v1"
@@ -132,7 +126,6 @@ def test_video_chat_completion(self):
 
         messages = self.prepare_video_messages(file_path)
 
-        start_time = time.time()
         video_request = client.chat.completions.create(
             model="default",
             messages=messages,
@@ -140,15 +133,14 @@ def test_video_chat_completion(self):
             max_tokens=1024,
             stream=True,
         )
 
+        print("-" * 30)
         video_response = ""
-
         for chunk in video_request:
             if chunk.choices[0].delta.content is not None:
                 content = chunk.choices[0].delta.content
                 video_response += content
-                sys.stdout.write(content)
-                sys.stdout.flush()
+                print(content, end="", flush=True)
         print("-" * 30)
 
         # Add assertions to validate the video response

From b20daf982a82bbeda120d2c30532c74970bd053d Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Sat, 24 Aug 2024 14:50:05 -0700
Subject: [PATCH 073/118] Update README.md (#1198)

---
 README.md | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 04dd913baa..2fc91e7858 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
+- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
 
 ## News
@@ -248,17 +248,19 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 
 #### Use Models From ModelScope