Commit fee7da1

Work in progress.

Added falcon main and library based on llama.cpp
CPU inference works (~260 ms/token on the 7B 16-bit Falcon)
Tested with the 7B 16-bit model and the two Shakespeare models (both in 16-bit precision only)

TODO/WIP:
1) quantization runs and creates a ggjt 3 file, but something is wrong with the quantized model binary
- even quantization from 16 -> 16 fails, so something is wrong in the tensors produced; an invocation sketch follows this list
2) mmap should work with quantized binaries once 1) is solved
3) CUDA support is mostly there, but it is currently disabled (all CPU backend)
4) memory/context calculations are off, and the GPU memory calculations are wrong as well
5) the Python conversion script predates the GGML 1 format (tokens without scores)
6) some things are still named "llama"; they should be renamed to something generic, as the code works for both
7) the GGML file produced by the current Python script uses an old ftype method
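
For item 1), the failing quantization run would look roughly like this. A sketch only: the argument layout is assumed to match llama.cpp's quantize tool, and the model paths and type name are hypothetical.

  ./falcon_quantize models/falcon-7b/ggml-model-f16.bin models/falcon-7b/ggml-model-q4_0.bin q4_0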

Makefiles:
CMake on Windows with Build Tools works.
The Makefile for Linux/MSYS was blindly adjusted but not tested yet; something may have been missed.
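
For reference, the Windows build follows the usual llama.cpp out-of-source CMake workflow; treat the exact steps as a sketch, since this fork may need extra options:

  mkdir build
  cd build
  cmake ..
  cmake --build . --config Release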

Changes to the codebase:
* repeat2 has been added to ggml (jploski - ggerganov/ggml#231), including the backward variant (untested, probably fails); a usage sketch follows this list
* minor changes to work with falcon (name length)
* libfalcon is the previous "llama.cpp" and falcon_main is the previous main.cpp
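
A minimal sketch of calling the new operator, assuming ggml_repeat2 mirrors the ggml_repeat signature (broadcast tensor a into the shape of tensor b, with the interleaving Falcon's multi-query attention needs); the helper name is hypothetical:

  #include "ggml.h"

  // Assumption: ggml_repeat2(ctx, a, b) repeats a into b's shape,
  // interleaving the copies rather than tiling them like ggml_repeat.
  struct ggml_tensor * broadcast_kv(struct ggml_context * ctx,
                                    struct ggml_tensor * kv_head,   // single shared K/V head
                                    struct ggml_tensor * q_shape) { // carries the target shape
      return ggml_repeat2(ctx, kv_head, q_shape);
  }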
John authored and JohannesGaessler committed Jun 17, 2023
1 parent b241649 commit fee7da1
Showing 17 changed files with 6,570 additions and 9 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -56,3 +56,6 @@ qnt-*.txt
perf-*.txt

examples/jeopardy/results.txt
demo_falcon_orig.cpp
.github/workflows/build.yml
.github/workflows/build.yml
39 changes: 34 additions & 5 deletions CMakeLists.txt
@@ -1,6 +1,9 @@
cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
project("llama.cpp" C CXX)

# If CUDA toolkit is not found using msvc compiler switch to Community Edition (same compiler, just other kit..)
project("ggllm.cpp" C CXX)
# add_definitions(-DGGML_PERF=1)
include_directories("C:/program files/NVIDIA GPU Computing Toolkit/CUDA/v12.0/include")
include_directories("C:/program files/NVIDIA GPU Computing Toolkit/CUDA/v12.0/lib/x64")
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
@@ -20,7 +23,7 @@ else()
endif()

if (EMSCRIPTEN)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
set(BUILD_SHARED_LIBS_DEFAULT OFF)

option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
else()
@@ -67,7 +70,7 @@ endif()
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
option(LLAMA_CUBLAS "llama: use cuBLAS" ON)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
@@ -225,13 +228,14 @@ if (LLAMA_BLAS)
endif()

if (LLAMA_CUBLAS)
cmake_minimum_required(VERSION 3.17)
cmake_minimum_required(VERSION 3.17)

find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")

enable_language(CUDA)
message(STATUS "CUDA found, version: ${CUDAToolkit_VERSION}")

set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)

@@ -480,19 +484,44 @@ target_link_libraries(llama PRIVATE
${LLAMA_EXTRA_LIBS}
)

# falcon
add_library(libfalcon
libfalcon.cpp
libfalcon.h
llama-util.h
)
target_include_directories(libfalcon PUBLIC .)
target_compile_features(libfalcon PUBLIC cxx_std_11) # don't bump
target_link_libraries(libfalcon PRIVATE
ggml
${LLAMA_EXTRA_LIBS}
)
#

if (BUILD_SHARED_LIBS)
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
if (LLAMA_METAL)
set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
endif()

# falcon
set_target_properties(libfalcon PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(libfalcon PRIVATE LLAMA_SHARED LLAMA_BUILD)
if (LLAMA_METAL)
set_target_properties(libfalcon PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
endif()
#
endif()

if (GGML_SOURCES_CUDA)
message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF)
# falcon
set_property(TARGET libfalcon PROPERTY CUDA_ARCHITECTURES OFF)

endif()


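Two details in the CMakeLists.txt diff above are worth noting: LLAMA_CUBLAS now defaults to ON, and the CUDA v12.0 include/lib paths are hard-coded for MSVC. On a machine without that exact toolkit layout, the configure step would presumably override the default; this is standard CMake usage, and its adequacy for this fork is an assumption:

  cmake -S . -B build -DLLAMA_CUBLAS=OFF   # opt out of the new cuBLAS default
  cmake --build build --config Release --target falcon_main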
11 changes: 11 additions & 0 deletions Makefile
@@ -255,9 +255,15 @@ ggml.o: ggml.c ggml.h ggml-cuda.h
llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@

libfalcon.o: libfalcon.cpp ggml.h ggml-cuda.h libfalcon.h llama-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@

common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c $< -o $@

falcon_common.o: examples/falcon_common.cpp examples/falcon_common.h
$(CXX) $(CXXFLAGS) -c $< -o $@

libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

@@ -280,6 +286,9 @@ simple: examples/simple/simple.cpp build-info.h ggml.
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

falcon_quantize: examples/falcon_quantize/quantize.cpp build-info.h ggml.o libfalcon.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

@@ -306,6 +315,8 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
rm $@.tmp; \
fi

falcon_main: examples/falcon/falcon_main.cpp build-info.h ggml.o libfalcon.o falcon_common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
#
# Tests
#
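With these targets in place, a Linux/MSYS build and a first run would presumably look like the following; untested, per the note in the commit message, and the model path and prompt are hypothetical:

  make falcon_main falcon_quantize
  ./falcon_main -m models/falcon-7b/ggml-model-f16.bin -p "Once upon a time"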
22 changes: 21 additions & 1 deletion examples/CMakeLists.txt
@@ -6,7 +6,7 @@ find_package(Threads REQUIRED)

# ...

# common
# common

set(TARGET common)

@@ -23,13 +23,33 @@ target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE llama)


# falcon_common

set(FALCON_TARGET falcon_common)

add_library(${FALCON_TARGET} OBJECT
falcon_common.h
falcon_common.cpp
)

if (BUILD_SHARED_LIBS)
set_target_properties(${FALCON_TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()

target_include_directories(${FALCON_TARGET} PUBLIC .)
target_compile_features(${FALCON_TARGET} PUBLIC cxx_std_11)
target_link_libraries(${FALCON_TARGET} PRIVATE libfalcon)

# examples

include_directories(${CMAKE_CURRENT_SOURCE_DIR})

if (EMSCRIPTEN)
else()
add_subdirectory(main)
add_subdirectory(falcon)
add_subdirectory(falcon_quantize)
add_subdirectory(quantize)
add_subdirectory(quantize-stats)
add_subdirectory(perplexity)
8 changes: 8 additions & 0 deletions examples/falcon/CMakeLists.txt
@@ -0,0 +1,8 @@
set(TARGET falcon_main)
add_executable(${TARGET} falcon_main.cpp)
target_link_libraries(${TARGET} PRIVATE falcon_common libfalcon ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
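
Downstream code could link against the new library the same way falcon_main does; a minimal consumer sketch using the target names from the diffs above (my_app is hypothetical):

  add_executable(my_app my_app.cpp)
  target_link_libraries(my_app PRIVATE libfalcon)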

(diffs for the remaining changed files not shown)
