diff --git a/index.html b/index.html
index 17a448e3..d8efbf5c 100644
--- a/index.html
+++ b/index.html
@@ -84,14 +84,11 @@

Code LLaMA Demo on an NVIDIA GeForce RTX 4070 laptop:

-VLM Demo on an Apple MacBook Pro (M1, 2021):

-

-

LLaMA Chat Demo on an Apple MacBook Pro (M1, 2021):

-

+

Overview

-

+

LLM Compression: SmoothQuant and AWQ

SmoothQuant: Smooth the activation outliers by migrating the quantization difficulty from activations to weights, with a mathematically equivalent transformation (100*1 = 10*10).

@@ -99,7 +96,7 @@

[Figure: SmoothQuant intuition]

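To make the equivalence concrete, here is a minimal NumPy sketch of the SmoothQuant scaling trick. This is illustrative only, not the library's API; the per-channel scale formula with `alpha` follows the paper's migration-strength idea, and all variable names are placeholders.

```python
import numpy as np

# Sketch of SmoothQuant's equivalent transformation:
# Y = X @ W = (X / s) @ (diag(s) @ W), with a per-input-channel scale s
# chosen to migrate outlier magnitude from activations into weights.
rng = np.random.default_rng(0)
X = rng.normal(size=(4, 8))      # activations: tokens x in_channels
X[:, 3] *= 100.0                 # inject an activation outlier channel
W = rng.normal(size=(8, 8))      # weights: in_channels x out_channels

alpha = 0.5                      # migration strength, as in the paper
s = np.abs(X).max(axis=0) ** alpha / np.abs(W).max(axis=1) ** (1 - alpha)

X_smooth = X / s                 # activation outliers are flattened
W_smooth = s[:, None] * W        # weights absorb the quantization difficulty

# Mathematically equivalent: the layer output is unchanged.
assert np.allclose(X @ W, X_smooth @ W_smooth)
print(np.abs(X).max(), "->", np.abs(X_smooth).max())  # outlier is tamed
```

Each scaled pair preserves its product, exactly as in the 100*1 = 10*10 intuition above; only the split of magnitude between activations and weights changes.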
AWQ (Activation-aware Weight Quantization): Protect salient weight channels by analyzing activation magnitudes rather than the weights themselves.

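A minimal sketch of that intuition, assuming a toy per-tensor 4-bit round-to-nearest quantizer (`quantize_int4` is a hypothetical helper, not the repo's API; the real AWQ uses grouped quantization and searches the scales): a channel with large activations but small weights is wrecked by plain RTN, and scaling it up before quantization protects it.

```python
import numpy as np

def quantize_int4(w):
    # Hypothetical per-tensor symmetric 4-bit round-to-nearest quantizer.
    step = np.abs(w).max() / 7.0
    return np.clip(np.round(w / step), -8, 7) * step

rng = np.random.default_rng(0)
X = rng.normal(size=(16, 8))
X[:, 2] *= 50.0                  # channel 2 carries large activations...
W = rng.normal(size=(8, 8))
W[2, :] *= 0.1                   # ...but its weights are small

# Salient channels are found from activation magnitude, not weight magnitude.
saliency = np.abs(X).mean(axis=0)
s = np.ones(8)
s[np.argmax(saliency)] = 4.0     # scale up the most salient input channel

W_rtn = quantize_int4(W)                             # plain round-to-nearest
W_awq = quantize_int4(s[:, None] * W) / s[:, None]   # AWQ-style protection

err_rtn = np.abs(X @ W - X @ W_rtn).mean()
err_awq = np.abs(X @ W - X @ W_awq).mean()
print(f"RTN output error: {err_rtn:.4f}, AWQ-style: {err_awq:.4f}")
```

Scaling by `s` and dividing back leaves the full-precision product unchanged, so the only effect is a smaller effective quantization step on the salient channel; the output error should drop noticeably.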
-

+

LLM Inference Engine: TinyChatEngine