
Commit b67cc50: Merge 'origin/master' into hipblas
2 parents: fcbc262 + e216aa0

16 files changed: +350, -179 lines

CMakeLists.txt (+14, -13)

@@ -77,21 +77,19 @@ option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 # Build info header
 #
 
-# Write header template to binary dir to keep source directory clean
-file(WRITE "${CMAKE_BINARY_DIR}/BUILD_INFO.h.in" "\
-#ifndef BUILD_INFO_H\n\
-#define BUILD_INFO_H\n\
-\n\
-#define BUILD_NUMBER @BUILD_NUMBER@\n\
-#define BUILD_COMMIT \"@BUILD_COMMIT@\"\n\
-\n\
-#endif // BUILD_INFO_H\n\
-")
-
 # Generate initial build-info.h
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
 
 if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
+    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git")
+
+    # Is git submodule
+    if(NOT IS_DIRECTORY "${GIT_DIR}")
+        file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
+        string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
+        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}")
+    endif()
+
     # Add a custom target for build-info.h
     add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
 
@@ -101,7 +99,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
         COMMENT "Generating build details from Git"
         COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/.git/index"
+        DEPENDS "${GIT_DIR}/index"
         VERBATIM
     )
 else()
@@ -389,8 +387,11 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
             add_compile_options(-mavx512vnni)
         endif()
     endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+    message(STATUS "PowerPC detected")
+    add_compile_options(-mcpu=native -mtune=native)
+    #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
 else()
-    # TODO: support PowerPC
     message(STATUS "Unknown architecture")
 endif()
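
For context on the CMakeLists.txt change: in a git submodule or worktree checkout, ".git" is a plain file containing a "gitdir: <path>" link rather than a directory, so the old DEPENDS on ".git/index" pointed at a file that never exists there. Below is a rough, illustrative C++ sketch of the same resolution logic the new CMake lines perform; the helper name resolve_git_dir is invented for this sketch and is not part of the repository.

    // Hypothetical C++ equivalent of the gitdir resolution added above:
    // follow the "gitdir: ..." link before depending on "<git-dir>/index".
    #include <fstream>
    #include <iostream>
    #include <regex>
    #include <string>

    std::string resolve_git_dir(const std::string & source_dir) {
        std::string git_path = source_dir + "/.git";
        std::ifstream f(git_path);
        std::string line;
        std::smatch m;
        if (f && std::getline(f, line) &&
            std::regex_match(line, m, std::regex("gitdir: (.*)"))) {
            return source_dir + "/" + m[1].str();  // submodule/worktree link
        }
        return git_path;  // ordinary checkout: .git is already a directory
    }

    int main() {
        std::cout << resolve_git_dir(".") << "\n";
        return 0;
    }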

Makefile (+1, -1)

@@ -226,7 +226,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
-	@scripts/build-info.sh > $@.tmp
+	@sh scripts/build-info.sh > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
 		mv $@.tmp $@; \
 	else \

examples/benchmark/benchmark-matmult.cpp (+3, -3)

@@ -38,9 +38,9 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {
 
 #define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
 
-#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5ld x %5ld x %5ld, nb = (%5li, %5li, %5li) - ", #TENSOR, \
+#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
         TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
-        TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
+        (int) TENSOR->ne[0], (int) TENSOR->ne[1], (int) TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
         { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
 
 struct benchmark_params_struct {
@@ -138,7 +138,7 @@ int main(int argc, char ** argv) {
     ctx = ggml_init(params);
     if (!ctx) {
         fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-        return false;
+        return 1;
     }
 
 
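
Background on the format-string fix: ggml stores tensor extents ne[] as 64-bit integers, so "%5ld" only matches where long is 64 bits, and is wrong on platforms such as Windows where long is 32 bits. The diff casts to int for "%d", which is safe for these benchmark shapes. An illustrative stand-alone sketch (not from the commit) showing the chosen fix next to the fully portable PRId64 form:

    #include <cinttypes>
    #include <cstdio>

    int main() {
        const int64_t ne0 = 11008;            // a typical tensor dimension
        printf("ne = %5d\n", (int) ne0);      // the approach taken in the diff
        printf("ne = %5" PRId64 "\n", ne0);   // portable alternative
        return 0;
    }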

examples/common.cpp (+62, -2)

@@ -66,6 +66,33 @@ int32_t get_num_physical_cores() {
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }
 
+std::string process_escapes(const char* input) {
+    std::string output;
+
+    if (input != nullptr) {
+        std::size_t input_len = std::strlen(input);
+        output.reserve(input_len);
+
+        for (std::size_t i = 0; i < input_len; ++i) {
+            if (input[i] == '\\' && i + 1 < input_len) {
+                switch (input[++i]) {
+                    case 'n':  output.push_back('\n'); break;
+                    case 't':  output.push_back('\t'); break;
+                    case '\'': output.push_back('\''); break;
+                    case '\"': output.push_back('\"'); break;
+                    case '\\': output.push_back('\\'); break;
+                    default:   output.push_back('\\');
+                               output.push_back(input[i]); break;
+                }
+            } else {
+                output.push_back(input[i]);
+            }
+        }
+    }
+
+    return output;
+}
+
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     bool invalid_param = false;
     std::string arg;
@@ -91,7 +118,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.prompt = argv[i];
+            params.prompt = process_escapes(argv[i]);
         } else if (arg == "--session") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -324,7 +351,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "                        run in interactive mode and poll user input upon seeing PROMPT (can be\n");
     fprintf(stderr, "                        specified more than once for multiple prompts).\n");
     fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
-    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for <= 0)\n");
+    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
     fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
@@ -405,6 +432,39 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
     return res;
 }
 
+struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
+    auto lparams = llama_context_default_params();
+
+    lparams.n_ctx      = params.n_ctx;
+    lparams.n_parts    = params.n_parts;
+    lparams.seed       = params.seed;
+    lparams.f16_kv     = params.memory_f16;
+    lparams.use_mmap   = params.use_mmap;
+    lparams.use_mlock  = params.use_mlock;
+    lparams.logits_all = params.perplexity;
+    lparams.embedding  = params.embedding;
+
+    llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
+
+    if (lctx == NULL) {
+        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+        return NULL;
+    }
+
+    if (!params.lora_adapter.empty()) {
+        int err = llama_apply_lora_from_file(lctx,
+                                             params.lora_adapter.c_str(),
+                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+                                             params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            return NULL;
+        }
+    }
+
+    return lctx;
+}
+
 /* Keep track of current color of output, and emit ANSI code if it changes. */
 void set_console_color(console_state & con_st, console_color_t color) {
     if (con_st.use_color && con_st.color != color) {
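
With process_escapes in place, a prompt passed via -p may contain \n, \t, \', \" and \\ escapes that are decoded before use; unrecognized escapes pass through unchanged. A small illustrative usage sketch, assuming it is linked against examples/common.cpp; the forward declaration below stands in for a header entry this commit does not add:

    #include <cstdio>
    #include <string>

    std::string process_escapes(const char * input);  // defined in examples/common.cpp

    int main() {
        // the C++ literal "One\\nTwo" is the 8 characters One\nTwo, as a shell
        // argument -p "One\nTwo" would deliver them
        const std::string prompt = process_escapes("One\\nTwo\\tthree \\\"quoted\\\"");
        printf("%s\n", prompt.c_str());  // prints "One", then "Two<TAB>three "quoted""
        return 0;
    }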

examples/common.h (+6)

@@ -77,6 +77,12 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 
 std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
 
+//
+// Model utils
+//
+
+struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
+
 //
 // Console utils
 //
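
The new llama_init_from_gpt_params helper centralizes the context-parameter setup, model load, and optional LoRA application that embedding.cpp and main.cpp previously duplicated. A hedged caller sketch, assuming compilation against this tree's examples/common.h and llama.h:

    #include "common.h"
    #include "llama.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        gpt_params params;
        if (!gpt_params_parse(argc, argv, params)) {
            return 1;
        }

        // one call replaces llama_context_default_params() +
        // llama_init_from_file() + llama_apply_lora_from_file()
        llama_context * ctx = llama_init_from_gpt_params(params);
        if (ctx == NULL) {
            return 1;  // the helper already printed the reason to stderr
        }

        // ... tokenize, evaluate, sample ...

        llama_free(ctx);
        return 0;
    }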

examples/embedding/embedding.cpp (+5, -19)

@@ -21,7 +21,7 @@ int main(int argc, char ** argv) {
 
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
-    if (params.seed <= 0) {
+    if (params.seed < 0) {
         params.seed = time(NULL);
     }
 
@@ -35,24 +35,10 @@ int main(int argc, char ** argv) {
     llama_context * ctx;
 
     // load the model
-    {
-        auto lparams = llama_context_default_params();
-
-        lparams.n_ctx      = params.n_ctx;
-        lparams.n_parts    = params.n_parts;
-        lparams.seed       = params.seed;
-        lparams.f16_kv     = params.memory_f16;
-        lparams.logits_all = params.perplexity;
-        lparams.use_mmap   = params.use_mmap;
-        lparams.use_mlock  = params.use_mlock;
-        lparams.embedding  = params.embedding;
-
-        ctx = llama_init_from_file(params.model.c_str(), lparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
+    ctx = llama_init_from_gpt_params(params);
+    if (ctx == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        return 1;
     }
 
     // print system information

examples/main/README.md (+1, -1)

@@ -130,7 +130,7 @@ It is important to note that the generated text may be shorter than the specified
 
 - `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1).
 
-The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than or equal to 0, a random seed will be used, which will result in different outputs on each run.
+The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
 
 ### Temperature
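
A restatement of the new seed rule as an illustrative sketch (not code from the commit, which mirrors this check in main.cpp and embedding.cpp): only negative seeds are replaced with a time-based value, so --seed 0 now reproduces the same output on every run.

    #include <cstdint>
    #include <cstdio>
    #include <ctime>

    // mirrors the check used in main.cpp/embedding.cpp after this commit
    static int32_t normalize_seed(int32_t seed) {
        return seed < 0 ? (int32_t) time(NULL) : seed;  // previously: seed <= 0
    }

    int main() {
        printf("seed 0  -> %d (now fixed, reproducible)\n", normalize_seed(0));
        printf("seed -1 -> %d (time-based, varies per run)\n", normalize_seed(-1));
        return 0;
    }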

examples/main/main.cpp (+22, -48)

@@ -22,6 +22,9 @@
 #include <signal.h>
 #include <unistd.h>
 #elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
 #include <signal.h>
 #endif
 
@@ -84,7 +87,7 @@ int main(int argc, char ** argv) {
 
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
-    if (params.seed <= 0) {
+    if (params.seed < 0) {
         params.seed = time(NULL);
     }
 
@@ -101,34 +104,11 @@ int main(int argc, char ** argv) {
     llama_context * ctx;
     g_ctx = &ctx;
 
-    // load the model
-    {
-        auto lparams = llama_context_default_params();
-
-        lparams.n_ctx     = params.n_ctx;
-        lparams.n_parts   = params.n_parts;
-        lparams.seed      = params.seed;
-        lparams.f16_kv    = params.memory_f16;
-        lparams.use_mmap  = params.use_mmap;
-        lparams.use_mlock = params.use_mlock;
-
-        ctx = llama_init_from_file(params.model.c_str(), lparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-    }
-
-    if (!params.lora_adapter.empty()) {
-        int err = llama_apply_lora_from_file(ctx,
-                                             params.lora_adapter.c_str(),
-                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            return 1;
-        }
+    // load the model and apply lora adapter, if any
+    ctx = llama_init_from_gpt_params(params);
+    if (ctx == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        return 1;
     }
 
     // print system information
@@ -263,7 +243,10 @@ int main(int argc, char ** argv) {
         sigint_action.sa_flags = 0;
         sigaction(SIGINT, &sigint_action, NULL);
 #elif defined (_WIN32)
-        signal(SIGINT, sigint_handler);
+        auto console_ctrl_handler = [](DWORD ctrl_type) -> BOOL {
+            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
+        };
+        SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
 
         fprintf(stderr, "%s: interactive mode on.\n", __func__);
@@ -298,17 +281,17 @@ int main(int argc, char ** argv) {
     }
 
     bool is_antiprompt = false;
-    bool input_noecho  = false;
+    bool input_echo    = true;
 
     // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
    // if we loaded a session with at least 75% similarity. It's currently just used to speed up the
     // initial prompt so it doesn't need to be an exact match.
     bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);
 
 
-    int n_past     = 0;
-    int n_remain   = params.n_predict;
-    int n_consumed = 0;
+    int n_past             = 0;
+    int n_remain           = params.n_predict;
+    int n_consumed         = 0;
     int n_session_consumed = 0;
 
     // the first thing we will do is to output the prompt, so set color accordingly
@@ -413,7 +396,7 @@ int main(int argc, char ** argv) {
         llama_token id = 0;
 
         {
-            auto logits = llama_get_logits(ctx);
+            auto logits  = llama_get_logits(ctx);
             auto n_vocab = llama_n_vocab(ctx);
 
             // Apply params.logit_bias map
@@ -485,7 +468,7 @@ int main(int argc, char ** argv) {
             embd.push_back(id);
 
             // echo this to console
-            input_noecho = false;
+            input_echo = true;
 
             // decrement remaining sampling budget
             --n_remain;
@@ -503,14 +486,14 @@ int main(int argc, char ** argv) {
         }
 
         // display text
-        if (!input_noecho) {
+        if (input_echo) {
             for (auto id : embd) {
                 printf("%s", llama_token_to_str(ctx, id));
             }
             fflush(stdout);
         }
         // reset color to default if we there is no pending user input
-        if (!input_noecho && (int)embd_inp.size() == n_consumed) {
+        if (input_echo && (int)embd_inp.size() == n_consumed) {
             set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
         }
 
@@ -542,11 +525,6 @@ int main(int argc, char ** argv) {
             // potentially set color to indicate we are taking user input
             set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
 
-#if defined (_WIN32)
-            // Windows: must reactivate sigint handler after each signal
-            signal(SIGINT, sigint_handler);
-#endif
-
             if (params.instruct) {
                 printf("\n> ");
             }
@@ -605,7 +583,7 @@ int main(int argc, char ** argv) {
                 n_remain -= line_inp.size();
             }
 
-            input_noecho = true; // do not echo this again
+            input_echo = false; // do not echo this again
         }
 
         if (n_past > 0) {
@@ -630,10 +608,6 @@ int main(int argc, char ** argv) {
             }
         }
 
-#if defined (_WIN32)
-        signal(SIGINT, SIG_DFL);
-#endif
-
     llama_print_timings(ctx);
    llama_free(ctx);
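
On the Windows signal handling in this file: the removed comment notes that a signal()-installed SIGINT handler must be re-armed after each delivery on Windows, which is why both re-arming blocks could be deleted once SetConsoleCtrlHandler is used, and a capture-less lambda converts to the PHANDLER_ROUTINE function pointer that API expects. An isolated, illustrative sketch of the pattern (Windows-only, not code from the commit):

    #define WIN32_LEAN_AND_MEAN
    #define NOMINMAX
    #include <windows.h>
    #include <cstdio>

    int main() {
        auto console_ctrl_handler = [](DWORD ctrl_type) -> BOOL {
            if (ctrl_type == CTRL_C_EVENT) {
                fprintf(stderr, "\ncaught Ctrl-C\n");
                return TRUE;   // handled; keep the process alive
            }
            return FALSE;      // defer other events to the default handler
        };
        // stays installed across repeated Ctrl-C presses, unlike signal()
        SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), TRUE);

        getchar();  // wait so there is time to press Ctrl-C
        return 0;
    }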
