Commit fee7da1

Work in progress.

Added falcon main and library based on llama.cpp
CPU inference works (~260 ms/token on the 7B 16-bit Falcon)
Tested with the 7B 16-bit model and the two Shakespeare models (both in 16-bit precision only)

TODO/WIP:
1) quantization runs and creates a ggjt 3 file, but something is wrong with the quantized model binary
- even quantization from 16 -> 16 fails, so something is wrong in the tensors produced; an invocation sketch follows this list
2) mmap should work with quantized binaries once 1) is solved
3) CUDA support is mostly there, but it is currently disabled (all CPU backend)
4) memory/context calculations are off, and the GPU memory calculations are wrong as well
5) the Python conversion script predates the GGML 1 format (tokens without scores)
6) some things are still named "llama"; they should be renamed to something generic, as the code works for both
7) the GGML file produced by the current Python script uses an old ftype method
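
For item 1), the failing quantization run would look roughly like this. A sketch only: the argument layout is assumed to match llama.cpp's quantize tool, and the model paths and type name are hypothetical.

  ./falcon_quantize models/falcon-7b/ggml-model-f16.bin models/falcon-7b/ggml-model-q4_0.bin q4_0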

Makefiles:
CMake on Windows with Build Tools works.
The Makefile for Linux/MSYS was blindly adjusted but not tested yet; something may have been missed.
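
For reference, the Windows build follows the usual llama.cpp out-of-source CMake workflow; treat the exact steps as a sketch, since this fork may need extra options:

  mkdir build
  cd build
  cmake ..
  cmake --build . --config Release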

Changes to the codebase:
* repeat2 has been added to ggml (jploski - ggerganov/ggml#231), including the backward variant (untested, probably fails); a usage sketch follows this list
* minor changes to work with falcon (name length)
* libfalcon is the previous "llama.cpp" and falcon_main is the previous main.cpp
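
A minimal sketch of calling the new operator, assuming ggml_repeat2 mirrors the ggml_repeat signature (broadcast tensor a into the shape of tensor b, with the interleaving Falcon's multi-query attention needs); the helper name is hypothetical:

  #include "ggml.h"

  // Assumption: ggml_repeat2(ctx, a, b) repeats a into b's shape,
  // interleaving the copies rather than tiling them like ggml_repeat.
  struct ggml_tensor * broadcast_kv(struct ggml_context * ctx,
                                    struct ggml_tensor * kv_head,   // single shared K/V head
                                    struct ggml_tensor * q_shape) { // carries the target shape
      return ggml_repeat2(ctx, kv_head, q_shape);
  }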
John authored and JohannesGaessler committed Jun 17, 2023
1 parent b241649 commit fee7da1
Showing 17 changed files with 6,570 additions and 9 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -56,3 +56,6 @@ qnt-*.txt
perf-*.txt

examples/jeopardy/results.txt
demo_falcon_orig.cpp
.github/workflows/build.yml
.github/workflows/build.yml
39 changes: 34 additions & 5 deletions CMakeLists.txt
@@ -1,6 +1,9 @@
cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
project("llama.cpp" C CXX)

# If CUDA toolkit is not found using msvc compiler switch to Community Edition (same compiler, just other kit..)
project("ggllm.cpp" C CXX)
# add_definitions(-DGGML_PERF=1)
include_directories("C:/program files/NVIDIA GPU Computing Toolkit/CUDA/v12.0/include")
include_directories("C:/program files/NVIDIA GPU Computing Toolkit/CUDA/v12.0/lib/x64")
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
@@ -20,7 +23,7 @@ else()
endif()

if (EMSCRIPTEN)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
set(BUILD_SHARED_LIBS_DEFAULT OFF)

option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
else()
@@ -67,7 +70,7 @@ endif()
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
option(LLAMA_CUBLAS "llama: use cuBLAS" ON)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
@@ -225,13 +228,14 @@ if (LLAMA_BLAS)
endif()

if (LLAMA_CUBLAS)
cmake_minimum_required(VERSION 3.17)
cmake_minimum_required(VERSION 3.17)

find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")

enable_language(CUDA)
message(STATUS "CUDA found, version: ${CUDAToolkit_VERSION}")

set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)

@@ -480,19 +484,44 @@ target_link_libraries(llama PRIVATE
${LLAMA_EXTRA_LIBS}
)

# falcon
add_library(libfalcon
libfalcon.cpp
libfalcon.h
llama-util.h
)
target_include_directories(libfalcon PUBLIC .)
target_compile_features(libfalcon PUBLIC cxx_std_11) # don't bump
target_link_libraries(libfalcon PRIVATE
ggml
${LLAMA_EXTRA_LIBS}
)
#

if (BUILD_SHARED_LIBS)
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
if (LLAMA_METAL)
set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
endif()

# falcon
set_target_properties(libfalcon PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(libfalcon PRIVATE LLAMA_SHARED LLAMA_BUILD)
if (LLAMA_METAL)
set_target_properties(libfalcon PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
endif()
#
endif()

if (GGML_SOURCES_CUDA)
message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF)
# falcon
set_property(TARGET libfalcon PROPERTY CUDA_ARCHITECTURES OFF)

endif()


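Two details in the CMakeLists.txt diff above are worth noting: LLAMA_CUBLAS now defaults to ON, and the CUDA v12.0 include/lib paths are hard-coded for MSVC. On a machine without that exact toolkit layout, the configure step would presumably override the default; this is standard CMake usage, and its adequacy for this fork is an assumption:

  cmake -S . -B build -DLLAMA_CUBLAS=OFF   # opt out of the new cuBLAS default
  cmake --build build --config Release --target falcon_main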
11 changes: 11 additions & 0 deletions Makefile
@@ -255,9 +255,15 @@ ggml.o: ggml.c ggml.h ggml-cuda.h
llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@

libfalcon.o: libfalcon.cpp ggml.h ggml-cuda.h libfalcon.h llama-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@

common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c $< -o $@

falcon_common.o: examples/falcon_common.cpp examples/falcon_common.h
$(CXX) $(CXXFLAGS) -c $< -o $@

libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

@@ -280,6 +286,9 @@ simple: examples/simple/simple.cpp build-info.h ggml.
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

falcon_quantize: examples/falcon_quantize/quantize.cpp build-info.h ggml.o libfalcon.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

@@ -306,6 +315,8 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
rm $@.tmp; \
fi

falcon_main: examples/falcon/falcon_main.cpp build-info.h ggml.o libfalcon.o falcon_common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
#
# Tests
#
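With these targets in place, a Linux/MSYS build and a first run would presumably look like the following; untested, per the note in the commit message, and the model path and prompt are hypothetical:

  make falcon_main falcon_quantize
  ./falcon_main -m models/falcon-7b/ggml-model-f16.bin -p "Once upon a time"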
22 changes: 21 additions & 1 deletion examples/CMakeLists.txt
@@ -6,7 +6,7 @@ find_package(Threads REQUIRED)

# ...

# common
# common

set(TARGET common)

@@ -23,13 +23,33 @@ target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE llama)


# falcon_common

set(FALCON_TARGET falcon_common)

add_library(${FALCON_TARGET} OBJECT
falcon_common.h
falcon_common.cpp
)

if (BUILD_SHARED_LIBS)
set_target_properties(${FALCON_TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()

target_include_directories(${FALCON_TARGET} PUBLIC .)
target_compile_features(${FALCON_TARGET} PUBLIC cxx_std_11)
target_link_libraries(${FALCON_TARGET} PRIVATE libfalcon)

# examples

include_directories(${CMAKE_CURRENT_SOURCE_DIR})

if (EMSCRIPTEN)
else()
add_subdirectory(main)
add_subdirectory(falcon)
add_subdirectory(falcon_quantize)
add_subdirectory(quantize)
add_subdirectory(quantize-stats)
add_subdirectory(perplexity)
8 changes: 8 additions & 0 deletions examples/falcon/CMakeLists.txt
@@ -0,0 +1,8 @@
set(TARGET falcon_main)
add_executable(${TARGET} falcon_main.cpp)
target_link_libraries(${TARGET} PRIVATE falcon_common libfalcon ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
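
Downstream code could link against the new library the same way falcon_main does; a minimal consumer sketch using the target names from the diffs above (my_app is hypothetical):

  add_executable(my_app my_app.cpp)
  target_link_libraries(my_app PRIVATE libfalcon)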

(diffs for the remaining changed files not shown)
