time=2025-06-23T14:18:35.139-05:00 level=INFO source=routes.go:1235 msg="server config" env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:8192 OLLAMA_DEBUG:INFO OLLAMA_FLASH_ATTENTION:true OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:C:\\\\Users\\\\mlt\\\\.ollama\\\\models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NEW_ENGINE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://* vscode-file://*] OLLAMA_SCHED_SPREAD:false ROCR_VISIBLE_DEVICES:]"
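The "server config" line above echoes every OLLAMA_* variable the server was started with; here the default context length is 8192 tokens, flash attention is enabled, and idle models are kept alive for 5 minutes. A minimal sketch of launching the server with the same overrides from Python (an assumption that the `ollama` binary is on PATH; the variable names are copied verbatim from the log line above):

    import os
    import subprocess

    # Start `ollama serve` with the overrides visible in the "server config" line.
    env = dict(os.environ,
               OLLAMA_CONTEXT_LENGTH="8192",    # default per-request context window
               OLLAMA_FLASH_ATTENTION="true",   # matches OLLAMA_FLASH_ATTENTION:true
               OLLAMA_KEEP_ALIVE="5m")          # unload idle models after 5 minutes
    subprocess.run(["ollama", "serve"], env=env)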
time=2025-06-23T14:18:35.154-05:00 level=INFO source=images.go:476 msg="total blobs: 30"
time=2025-06-23T14:18:35.159-05:00 level=INFO source=images.go:483 msg="total unused blobs removed: 0"
time=2025-06-23T14:18:35.163-05:00 level=INFO source=routes.go:1288 msg="Listening on 127.0.0.1:11434 (version 0.9.2-8-g2bb69b4-dirty)"
time=2025-06-23T14:18:35.163-05:00 level=INFO source=gpu.go:217 msg="looking for compatible GPUs"
time=2025-06-23T14:18:35.163-05:00 level=INFO source=gpu_windows.go:167 msg=packages count=1
time=2025-06-23T14:18:35.163-05:00 level=INFO source=gpu_windows.go:214 msg="" package=0 cores=4 efficiency=0 threads=8
time=2025-06-23T14:18:36.259-05:00 level=INFO source=types.go:130 msg="inference compute" id=0 library=rocm variant="" compute=gfx1034 driver=6.4 name="AMD Radeon RX 6400" total="4.0 GiB" available="3.8 GiB"
[GIN] 2025/06/23 - 14:19:13 | 200 |      1.0858ms |       127.0.0.1 | HEAD     "/"
[GIN] 2025/06/23 - 14:19:13 | 200 |    185.6844ms |       127.0.0.1 | POST     "/api/show"
time=2025-06-23T14:19:14.877-05:00 level=INFO source=sched.go:189 msg="one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency"
time=2025-06-23T14:19:15.836-05:00 level=INFO source=sched.go:788 msg="new model will fit in available VRAM in single GPU, loading" model=C:\Users\mlt\.ollama\models\blobs\sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 gpu=0 parallel=2 available=3983867904 required="3.4 GiB"
time=2025-06-23T14:19:16.811-05:00 level=INFO source=server.go:135 msg="system memory" total="24.0 GiB" free="10.5 GiB" free_swap="19.7 GiB"
time=2025-06-23T14:19:17.785-05:00 level=INFO source=server.go:168 msg=offload library=rocm layers.requested=-1 layers.model=37 layers.offload=37 layers.split="" memory.available="[3.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="3.4 GiB" memory.required.partial="3.4 GiB" memory.required.kv="576.0 MiB" memory.required.allocations="[3.4 GiB]" memory.weights.total="1.8 GiB" memory.weights.repeating="1.6 GiB" memory.weights.nonrepeating="243.4 MiB" memory.graph.full="552.0 MiB" memory.graph.partial="680.0 MiB"
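A rough reconstruction of the offload arithmetic in the line above, as a sketch only: summing the itemized weights, KV cache, and graph buffers lands near, but below, the reported memory.required.full of 3.4 GiB, because the scheduler also reserves output and runtime buffers that are not itemized in this log line.

    # Approximate sum of the components itemized in the offload line.
    GiB, MiB = 2**30, 2**20
    weights = 1.8 * GiB      # memory.weights.total
    kv      = 576 * MiB      # memory.required.kv at ctx 16384
    graph   = 552 * MiB      # memory.graph.full
    print((weights + kv + graph) / GiB)  # ~2.9 of the reported 3.4 GiB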
time=2025-06-23T14:19:17.786-05:00 level=INFO source=server.go:211 msg="enabling flash attention"
time=2025-06-23T14:19:17.786-05:00 level=WARN source=server.go:219 msg="kv cache type not supported by model" type=""
llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from C:\Users\mlt\.ollama\models\blobs\sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Qwen2.5 3B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Qwen2.5
llama_model_loader: - kv   5:                         general.size_label str              = 3B
llama_model_loader: - kv   6:                            general.license str              = other
llama_model_loader: - kv   7:                       general.license.name str              = qwen-research
llama_model_loader: - kv   8:                       general.license.link str              = https://huggingface.co/Qwen/Qwen2.5-3...
llama_model_loader: - kv   9:                   general.base_model.count u32              = 1
llama_model_loader: - kv  10:                  general.base_model.0.name str              = Qwen2.5 3B
llama_model_loader: - kv  11:          general.base_model.0.organization str              = Qwen
llama_model_loader: - kv  12:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen2.5-3B
llama_model_loader: - kv  13:                               general.tags arr[str,2]       = ["chat", "text-generation"]
llama_model_loader: - kv  14:                          general.languages arr[str,1]       = ["en"]
llama_model_loader: - kv  15:                          qwen2.block_count u32              = 36
llama_model_loader: - kv  16:                       qwen2.context_length u32              = 32768
llama_model_loader: - kv  17:                     qwen2.embedding_length u32              = 2048
llama_model_loader: - kv  18:                  qwen2.feed_forward_length u32              = 11008
llama_model_loader: - kv  19:                 qwen2.attention.head_count u32              = 16
llama_model_loader: - kv  20:              qwen2.attention.head_count_kv u32              = 2
llama_model_loader: - kv  21:                       qwen2.rope.freq_base f32              = 1000000.000000
llama_model_loader: - kv  22:     qwen2.attention.layer_norm_rms_epsilon f32              = 0.000001
llama_model_loader: - kv  23:                          general.file_type u32              = 15
llama_model_loader: - kv  24:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  25:                         tokenizer.ggml.pre str              = qwen2
llama_model_loader: - kv  26:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  27:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  28:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv  29:                tokenizer.ggml.eos_token_id u32              = 151645
llama_model_loader: - kv  30:            tokenizer.ggml.padding_token_id u32              = 151643
llama_model_loader: - kv  31:                tokenizer.ggml.bos_token_id u32              = 151643
llama_model_loader: - kv  32:               tokenizer.ggml.add_bos_token bool             = false
llama_model_loader: - kv  33:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
llama_model_loader: - kv  34:               general.quantization_version u32              = 2
llama_model_loader: - type  f32:  181 tensors
llama_model_loader: - type q4_K:  216 tensors
llama_model_loader: - type q6_K:   37 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type   = Q4_K - Medium
print_info: file size   = 1.79 GiB (4.99 BPW)
load: special tokens cache size = 22
load: token to piece cache size = 0.9310 MB
print_info: arch             = qwen2
print_info: vocab_only       = 1
print_info: model type       = ?B
print_info: model params     = 3.09 B
print_info: general.name     = Qwen2.5 3B Instruct
print_info: vocab type       = BPE
print_info: n_vocab          = 151936
print_info: n_merges         = 151387
print_info: BOS token        = 151643 '<|endoftext|>'
print_info: EOS token        = 151645 '<|im_end|>'
print_info: EOT token        = 151645 '<|im_end|>'
print_info: PAD token        = 151643 '<|endoftext|>'
print_info: LF token         = 198 'Ċ'
print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
print_info: FIM MID token    = 151660 '<|fim_middle|>'
print_info: FIM PAD token    = 151662 '<|fim_pad|>'
print_info: FIM REP token    = 151663 '<|repo_name|>'
print_info: FIM SEP token    = 151664 '<|file_sep|>'
print_info: EOG token        = 151643 '<|endoftext|>'
print_info: EOG token        = 151645 '<|im_end|>'
print_info: EOG token        = 151662 '<|fim_pad|>'
print_info: EOG token        = 151663 '<|repo_name|>'
print_info: EOG token        = 151664 '<|file_sep|>'
print_info: max token length = 256
llama_model_load: vocab only - skipping tensors
time=2025-06-23T14:19:18.263-05:00 level=INFO source=server.go:431 msg="starting llama server" cmd="C:\\\\dev\\\\ollama\\\\ollama\\\\ollama.exe runner --model C:\\\\Users\\\\mlt\\\\.ollama\\\\models\\\\blobs\\\\sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 --ctx-size 16384 --batch-size 512 --n-gpu-layers 37 --threads 4 --flash-attn --parallel 2 --port 50349"
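Note that --ctx-size 16384 here is OLLAMA_CONTEXT_LENGTH (8192) multiplied by --parallel 2, so each sequence still gets an 8192-token window (n_ctx_per_seq below). The same knobs can also be set per request through the API options; a sketch using the standard num_ctx/num_gpu option keys (the model tag "qwen2.5:3b" is an assumption, since the log only identifies the blob by hash):

    import requests

    r = requests.post("http://127.0.0.1:11434/api/generate", json={
        "model": "qwen2.5:3b",         # assumed tag for the blob being loaded here
        "prompt": "Hello",
        "stream": False,
        "options": {"num_ctx": 16384,  # maps to the runner's --ctx-size
                    "num_gpu": 37},    # maps to --n-gpu-layers
    })
    print(r.json()["response"])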
time=2025-06-23T14:19:18.273-05:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
time=2025-06-23T14:19:18.273-05:00 level=INFO source=server.go:591 msg="waiting for llama runner to start responding"
time=2025-06-23T14:19:18.275-05:00 level=INFO source=server.go:625 msg="waiting for server to become available" status="llm server error"
time=2025-06-23T14:19:18.368-05:00 level=INFO source=runner.go:815 msg="starting go runner"
ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 ROCm devices:
  Device 0: AMD Radeon RX 6400, gfx1034 (0x1034), VMM: no, Wave Size: 32
load_backend: loaded ROCm backend from C:\dev\ollama\ollama\build\lib\ollama\ggml-hip.dll
load_backend: loaded CPU backend from C:\dev\ollama\ollama\build\lib\ollama\ggml-cpu-sandybridge.dll
time=2025-06-23T14:19:18.749-05:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.LLAMAFILE=1 CPU.1.SSE3=1 CPU.1.LLAMAFILE=1 ROCm.0.NO_VMM=1 ROCm.0.NO_PEER_COPY=1 ROCm.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(gcc)
time=2025-06-23T14:19:18.753-05:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:50349"
time=2025-06-23T14:19:18.778-05:00 level=INFO source=server.go:625 msg="waiting for server to become available" status="llm server loading model"
llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon RX 6400) - 3939 MiB free
llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from C:\Users\mlt\.ollama\models\blobs\sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Qwen2.5 3B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Qwen2.5
llama_model_loader: - kv   5:                         general.size_label str              = 3B
llama_model_loader: - kv   6:                            general.license str              = other
llama_model_loader: - kv   7:                       general.license.name str              = qwen-research
llama_model_loader: - kv   8:                       general.license.link str              = https://huggingface.co/Qwen/Qwen2.5-3...
llama_model_loader: - kv   9:                   general.base_model.count u32              = 1
llama_model_loader: - kv  10:                  general.base_model.0.name str              = Qwen2.5 3B
llama_model_loader: - kv  11:          general.base_model.0.organization str              = Qwen
llama_model_loader: - kv  12:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen2.5-3B
llama_model_loader: - kv  13:                               general.tags arr[str,2]       = ["chat", "text-generation"]
llama_model_loader: - kv  14:                          general.languages arr[str,1]       = ["en"]
llama_model_loader: - kv  15:                          qwen2.block_count u32              = 36
llama_model_loader: - kv  16:                       qwen2.context_length u32              = 32768
llama_model_loader: - kv  17:                     qwen2.embedding_length u32              = 2048
llama_model_loader: - kv  18:                  qwen2.feed_forward_length u32              = 11008
llama_model_loader: - kv  19:                 qwen2.attention.head_count u32              = 16
llama_model_loader: - kv  20:              qwen2.attention.head_count_kv u32              = 2
llama_model_loader: - kv  21:                       qwen2.rope.freq_base f32              = 1000000.000000
llama_model_loader: - kv  22:     qwen2.attention.layer_norm_rms_epsilon f32              = 0.000001
llama_model_loader: - kv  23:                          general.file_type u32              = 15
llama_model_loader: - kv  24:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  25:                         tokenizer.ggml.pre str              = qwen2
llama_model_loader: - kv  26:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  27:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  28:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv  29:                tokenizer.ggml.eos_token_id u32              = 151645
llama_model_loader: - kv  30:            tokenizer.ggml.padding_token_id u32              = 151643
llama_model_loader: - kv  31:                tokenizer.ggml.bos_token_id u32              = 151643
llama_model_loader: - kv  32:               tokenizer.ggml.add_bos_token bool             = false
llama_model_loader: - kv  33:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
llama_model_loader: - kv  34:               general.quantization_version u32              = 2
llama_model_loader: - type  f32:  181 tensors
llama_model_loader: - type q4_K:  216 tensors
llama_model_loader: - type q6_K:   37 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type   = Q4_K - Medium
print_info: file size   = 1.79 GiB (4.99 BPW)
load: special tokens cache size = 22
load: token to piece cache size = 0.9310 MB
print_info: arch             = qwen2
print_info: vocab_only       = 0
print_info: n_ctx_train      = 32768
print_info: n_embd           = 2048
print_info: n_layer          = 36
print_info: n_head           = 16
print_info: n_head_kv        = 2
print_info: n_rot            = 128
print_info: n_swa            = 0
print_info: n_swa_pattern    = 1
print_info: n_embd_head_k    = 128
print_info: n_embd_head_v    = 128
print_info: n_gqa            = 8
print_info: n_embd_k_gqa     = 256
print_info: n_embd_v_gqa     = 256
print_info: f_norm_eps       = 0.0e+00
print_info: f_norm_rms_eps   = 1.0e-06
print_info: f_clamp_kqv      = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale    = 0.0e+00
print_info: f_attn_scale     = 0.0e+00
print_info: n_ff             = 11008
print_info: n_expert         = 0
print_info: n_expert_used    = 0
print_info: causal attn      = 1
print_info: pooling type     = -1
print_info: rope type        = 2
print_info: rope scaling     = linear
print_info: freq_base_train  = 1000000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn  = 32768
print_info: rope_finetuned   = unknown
print_info: ssm_d_conv       = 0
print_info: ssm_d_inner      = 0
print_info: ssm_d_state      = 0
print_info: ssm_dt_rank      = 0
print_info: ssm_dt_b_c_rms   = 0
print_info: model type       = 3B
print_info: model params     = 3.09 B
print_info: general.name     = Qwen2.5 3B Instruct
print_info: vocab type       = BPE
print_info: n_vocab          = 151936
print_info: n_merges         = 151387
print_info: BOS token        = 151643 '<|endoftext|>'
print_info: EOS token        = 151645 '<|im_end|>'
print_info: EOT token        = 151645 '<|im_end|>'
print_info: PAD token        = 151643 '<|endoftext|>'
print_info: LF token         = 198 'Ċ'
print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
print_info: FIM MID token    = 151660 '<|fim_middle|>'
print_info: FIM PAD token    = 151662 '<|fim_pad|>'
print_info: FIM REP token    = 151663 '<|repo_name|>'
print_info: FIM SEP token    = 151664 '<|file_sep|>'
print_info: EOG token        = 151643 '<|endoftext|>'
print_info: EOG token        = 151645 '<|im_end|>'
print_info: EOG token        = 151662 '<|fim_pad|>'
print_info: EOG token        = 151663 '<|repo_name|>'
print_info: EOG token        = 151664 '<|file_sep|>'
print_info: max token length = 256
load_tensors: loading model tensors, this can take a while... (mmap = true)
load_tensors: offloading 36 repeating layers to GPU
load_tensors: offloading output layer to GPU
load_tensors: offloaded 37/37 layers to GPU
load_tensors:   CPU_Mapped model buffer size =   243.43 MiB
load_tensors:        ROCm0 model buffer size =  1834.83 MiB
llama_context: constructing llama_context
llama_context: n_seq_max     = 2
llama_context: n_ctx         = 16384
llama_context: n_ctx_per_seq = 8192
llama_context: n_batch       = 1024
llama_context: n_ubatch      = 512
llama_context: causal_attn   = 1
llama_context: flash_attn    = 1
llama_context: freq_base     = 1000000.0
llama_context: freq_scale    = 1
llama_context: n_ctx_per_seq (8192) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
llama_context:  ROCm_Host  output buffer size =     1.17 MiB
llama_kv_cache_unified: kv_size = 16384, type_k = 'f16', type_v = 'f16', n_layer = 36, can_shift = 1, padding = 256
llama_kv_cache_unified:      ROCm0 KV buffer size =   576.00 MiB
llama_kv_cache_unified: KV self size  =  576.00 MiB, K (f16):  288.00 MiB, V (f16):  288.00 MiB
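The 576 MiB KV size follows exactly from the geometry printed earlier (n_layer = 36, n_embd_k_gqa = n_embd_v_gqa = 256, kv_size = 16384, and 2 bytes per f16 element):

    # Check of the KV cache size reported above.
    n_layer, n_embd_gqa, kv_size, bytes_f16 = 36, 256, 16384, 2
    k_bytes = n_layer * n_embd_gqa * kv_size * bytes_f16
    print(k_bytes / 2**20)  # 288.0 MiB for K; V is identical, so 576 MiB total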
llama_context:      ROCm0 compute buffer size =   300.75 MiB
llama_context:  ROCm_Host compute buffer size =    36.01 MiB
llama_context: graph nodes  = 1195
llama_context: graph splits = 2
time=2025-06-23T14:19:28.566-05:00 level=INFO source=server.go:630 msg="llama runner started in 10.29 seconds"
[GIN] 2025/06/23 - 14:19:28 | 200 |   14.6728392s |       127.0.0.1 | POST     "/api/generate"
[GIN] 2025/06/23 - 14:20:32 | 200 |   42.1578579s |       127.0.0.1 | POST     "/api/chat"
[GIN] 2025/06/23 - 14:22:15 | 200 |       567.5µs |       127.0.0.1 | HEAD     "/"
[GIN] 2025/06/23 - 14:22:15 | 200 |            0s |       127.0.0.1 | GET      "/api/ps"
[GIN] 2025/06/23 - 14:22:17 | 200 |         1m13s |       127.0.0.1 | POST     "/api/chat"
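The long /api/chat requests above (42 s and 1m13s) are ordinary chat completions against the loaded Qwen model; a minimal equivalent call (the model tag is an assumption, as the log only identifies the blob by hash):

    import requests

    r = requests.post("http://127.0.0.1:11434/api/chat", json={
        "model": "qwen2.5:3b",  # assumed tag
        "messages": [{"role": "user", "content": "Why is the sky blue?"}],
        "stream": False,
    })
    print(r.json()["message"]["content"])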
time=2025-06-27T13:09:15.701-05:00 level=INFO source=server.go:135 msg="system memory" total="24.0 GiB" free="7.0 GiB" free_swap="13.3 GiB"
time=2025-06-27T13:09:16.700-05:00 level=INFO source=server.go:168 msg=offload library=rocm layers.requested=-1 layers.model=29 layers.offload=23 layers.split="" memory.available="[3.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="3.8 GiB" memory.required.partial="3.1 GiB" memory.required.kv="896.0 MiB" memory.required.allocati GiB]" memory.weights.total="1.9 GiB" memory.weights.repeating="1.6 GiB" memory.weights.n MiB" memory.graph.full="424.0 MiB" memory.graph.partial="570.7 MiB"
time=2025-06-27T13:09:16.701-05:00 level=INFO source=server.go:211 msg="enabling flash attention"
time=2025-06-27T13:09:16.701-05:00 level=WARN source=server.go:219 msg="kv cache type not supported by model" type=""
llama_model_loader: loaded meta data with 30 key-value pairs and 255 tensors from C:\Users\mlt\.ollama\models\blobs\sha256-dde5aa3fc5ffc17176b5e8bdc82f587b24b2678c6c66101bf7da77af9f7ccdff (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 3B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Llama-3.2
llama_model_loader: - kv   5:                         general.size_label str              = 3B
llama_model_loader: - kv   6:                               general.tags arr[str,6]       = ["facebook", "meta", "pytorch", "llam...
llama_model_loader: - kv   7:                          general.languages arr[str,8]       = ["en", "de", "fr", "it", "pt", "hi", ...
llama_model_loader: - kv   8:                          llama.block_count u32              = 28
llama_model_loader: - kv   9:                       llama.context_length u32              = 131072
llama_model_loader: - kv  10:                     llama.embedding_length u32              = 3072
llama_model_loader: - kv  11:                  llama.feed_forward_length u32              = 8192
llama_model_loader: - kv  12:                 llama.attention.head_count u32              = 24
llama_model_loader: - kv  13:              llama.attention.head_count_kv u32              = 8
llama_model_loader: - kv  14:                       llama.rope.freq_base f32              = 500000.000000
llama_model_loader: - kv  15:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
llama_model_loader: - kv  16:                 llama.attention.key_length u32              = 128
llama_model_loader: - kv  17:               llama.attention.value_length u32              = 128
llama_model_loader: - kv  18:                          general.file_type u32              = 15
llama_model_loader: - kv  19:                           llama.vocab_size u32              = 128256
llama_model_loader: - kv  20:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv  21:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  22:                         tokenizer.ggml.pre str              = llama-bpe
llama_model_loader: - kv  23:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  24:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  25:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
llama_model_loader: - kv  26:                tokenizer.ggml.bos_token_id u32              = 128000
llama_model_loader: - kv  27:                tokenizer.ggml.eos_token_id u32              = 128009
llama_model_loader: - kv  28:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
llama_model_loader: - kv  29:               general.quantization_version u32              = 2
llama_model_loader: - type  f32:   58 tensors
llama_model_loader: - type q4_K:  168 tensors
llama_model_loader: - type q6_K:   29 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type   = Q4_K - Medium
print_info: file size   = 1.87 GiB (5.01 BPW)
load: special tokens cache size = 256
load: token to piece cache size = 0.7999 MB
print_info: arch             = llama
print_info: vocab_only       = 1
print_info: model type       = ?B
print_info: model params     = 3.21 B
print_info: general.name     = Llama 3.2 3B Instruct
print_info: vocab type       = BPE
print_info: n_vocab          = 128256
print_info: n_merges         = 280147
print_info: BOS token        = 128000 '<|begin_of_text|>'
print_info: EOS token        = 128009 '<|eot_id|>'
print_info: EOT token        = 128009 '<|eot_id|>'
print_info: EOM token        = 128008 '<|eom_id|>'
print_info: LF token         = 198 'Ċ'
print_info: EOG token        = 128008 '<|eom_id|>'
print_info: EOG token        = 128009 '<|eot_id|>'
print_info: max token length = 256
llama_model_load: vocab only - skipping tensors
time=2025-06-27T13:09:17.256-05:00 level=INFO source=server.go:431 msg="starting llama server" cmd="C:\\\\dev\\\\ollama\\\\ollama\\\\ollama.exe runner --model C:\\\\Users\\\\mlt\\\\.ollama\\\\models\\\\blobs\\\\sha256-dde5aa3fc5ffc17176b5e8bdc82f587b24b2678c6c66101bf7da77af9f7ccdff --ctx-size 8192 --batch-size 512 --n-gpu-layers 23 --threads 4 --flash-attn --parallel 1 --port 58003"
time=2025-06-27T13:09:17.322-05:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
time=2025-06-27T13:09:17.323-05:00 level=INFO source=server.go:591 msg="waiting for llama runner to start responding"
time=2025-06-27T13:09:17.329-05:00 level=INFO source=server.go:625 msg="waiting for server to become available" status="llm server error"
time=2025-06-27T13:09:17.479-05:00 level=INFO source=runner.go:815 msg="starting go runner"
ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 ROCm devices:
  Device 0: AMD Radeon RX 6400, gfx1034 (0x1034), VMM: no, Wave Size: 32
load_backend: loaded ROCm backend from C:\dev\ollama\ollama\build\lib\ollama\ggml-hip.dll
load_backend: loaded CPU backend from C:\dev\ollama\ollama\build\lib\ollama\ggml-cpu-sandybridge.dll
time=2025-06-27T13:09:18.040-05:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.LLAMAFILE=1 CPU.1.SSE3=1 CPU.1.LLAMAFILE=1 ROCm.0.NO_VMM=1 ROCm.0.NO_PEER_COPY=1 ROCm.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(gcc)
time=2025-06-27T13:09:18.045-05:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:58003"
time=2025-06-27T13:09:18.087-05:00 level=INFO source=server.go:625 msg="waiting for server to become available" status="llm server loading model"
llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon RX 6400) - 3939 MiB free
llama_model_loader: loaded meta data with 30 key-value pairs and 255 tensors from C:\Users\mlt\.ollama\models\blobs\sha256-dde5aa3fc5ffc17176b5e8bdc82f587b24b2678c6c66101bf7da77af9f7ccdff (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 3B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Llama-3.2
llama_model_loader: - kv   5:                         general.size_label str              = 3B
llama_model_loader: - kv   6:                               general.tags arr[str,6]       = ["facebook", "meta", "pytorch", "llam...
llama_model_loader: - kv   7:                          general.languages arr[str,8]       = ["en", "de", "fr", "it", "pt", "hi", ...
llama_model_loader: - kv   8:                          llama.block_count u32              = 28
llama_model_loader: - kv   9:                       llama.context_length u32              = 131072
llama_model_loader: - kv  10:                     llama.embedding_length u32              = 3072
llama_model_loader: - kv  11:                  llama.feed_forward_length u32              = 8192
llama_model_loader: - kv  12:                 llama.attention.head_count u32              = 24
llama_model_loader: - kv  13:              llama.attention.head_count_kv u32              = 8
llama_model_loader: - kv  14:                       llama.rope.freq_base f32              = 500000.000000
llama_model_loader: - kv  15:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
llama_model_loader: - kv  16:                 llama.attention.key_length u32              = 128
llama_model_loader: - kv  17:               llama.attention.value_length u32              = 128
llama_model_loader: - kv  18:                          general.file_type u32              = 15
llama_model_loader: - kv  19:                           llama.vocab_size u32              = 128256
llama_model_loader: - kv  20:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv  21:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  22:                         tokenizer.ggml.pre str              = llama-bpe
llama_model_loader: - kv  23:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  24:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  25:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
llama_model_loader: - kv  26:                tokenizer.ggml.bos_token_id u32              = 128000
llama_model_loader: - kv  27:                tokenizer.ggml.eos_token_id u32              = 128009
llama_model_loader: - kv  28:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
llama_model_loader: - kv  29:               general.quantization_version u32              = 2
llama_model_loader: - type  f32:   58 tensors
llama_model_loader: - type q4_K:  168 tensors
llama_model_loader: - type q6_K:   29 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type   = Q4_K - Medium
print_info: file size   = 1.87 GiB (5.01 BPW)
load: special tokens cache size = 256
load: token to piece cache size = 0.7999 MB
print_info: arch             = llama
print_info: vocab_only       = 0
print_info: n_ctx_train      = 131072
print_info: n_embd           = 3072
print_info: n_layer          = 28
print_info: n_head           = 24
print_info: n_head_kv        = 8
print_info: n_rot            = 128
print_info: n_swa            = 0
print_info: n_swa_pattern    = 1
print_info: n_embd_head_k    = 128
print_info: n_embd_head_v    = 128
print_info: n_gqa            = 3
print_info: n_embd_k_gqa     = 1024
print_info: n_embd_v_gqa     = 1024
print_info: f_norm_eps       = 0.0e+00
print_info: f_norm_rms_eps   = 1.0e-05
print_info: f_clamp_kqv      = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale    = 0.0e+00
print_info: f_attn_scale     = 0.0e+00
print_info: n_ff             = 8192
print_info: n_expert         = 0
print_info: n_expert_used    = 0
print_info: causal attn      = 1
print_info: pooling type     = 0
print_info: rope type        = 0
print_info: rope scaling     = linear
print_info: freq_base_train  = 500000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn  = 131072
print_info: rope_finetuned   = unknown
print_info: ssm_d_conv       = 0
print_info: ssm_d_inner      = 0
print_info: ssm_d_state      = 0
print_info: ssm_dt_rank      = 0
print_info: ssm_dt_b_c_rms   = 0
print_info: model type       = 3B
print_info: model params     = 3.21 B
print_info: general.name     = Llama 3.2 3B Instruct
print_info: vocab type       = BPE
print_info: n_vocab          = 128256
print_info: n_merges         = 280147
print_info: BOS token        = 128000 '<|begin_of_text|>'
print_info: EOS token        = 128009 '<|eot_id|>'
print_info: EOT token        = 128009 '<|eot_id|>'
print_info: EOM token        = 128008 '<|eom_id|>'
print_info: LF token         = 198 'Ċ'
print_info: EOG token        = 128008 '<|eom_id|>'
print_info: EOG token        = 128009 '<|eot_id|>'
print_info: max token length = 256
load_tensors: loading model tensors, this can take a while... (mmap = true)
load_tensors: offloading 23 repeating layers to GPU
load_tensors: offloaded 23/29 layers to GPU
load_tensors:        ROCm0 model buffer size =  1319.11 MiB
load_tensors:   CPU_Mapped model buffer size =  1918.35 MiB
llama_context: constructing llama_context
llama_context: n_seq_max     = 1
llama_context: n_ctx         = 8192
llama_context: n_ctx_per_seq = 8192
llama_context: n_batch       = 512
llama_context: n_ubatch      = 512
llama_context: causal_attn   = 1
llama_context: flash_attn    = 1
llama_context: freq_base     = 500000.0
llama_context: freq_scale    = 1
llama_context: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_context:        CPU  output buffer size =     0.50 MiB
llama_kv_cache_unified: kv_size = 8192, type_k = 'f16', type_v = 'f16', n_layer = 28, can_shift = 1, padding = 256
llama_kv_cache_unified:      ROCm0 KV buffer size =   736.00 MiB
llama_kv_cache_unified:        CPU KV buffer size =   160.00 MiB
llama_kv_cache_unified: KV self size  =  896.00 MiB, K (f16):  448.00 MiB, V (f16):  448.00 MiB
llama_context:      ROCm0 compute buffer size =   564.73 MiB
llama_context:  ROCm_Host compute buffer size =    22.01 MiB
llama_context: graph nodes  = 847
llama_context: graph splits = 60 (with bs=512), 3 (with bs=1)
time=2025-06-27T13:09:28.121-05:00 level=INFO source=server.go:630 msg="llama runner started in 10.80 seconds"
time=2025-06-27T13:09:28.178-05:00 level=WARN source=runner.go:128 msg="truncating input prompt" limit=8192 prompt=30805 keep=5 new=8192
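Here the runner silently truncated a 30805-token prompt to the 8192-token context, keeping the first 5 tokens and the most recent remainder. To avoid the truncation, num_ctx can be raised per request, at the cost of a larger KV cache, which on this 4 GiB card would push even more layers off the GPU; a sketch (the model tag and the prompt variable are assumptions):

    import requests

    long_prompt = "..."  # placeholder for the ~30k-token prompt seen in the log
    r = requests.post("http://127.0.0.1:11434/api/chat", json={
        "model": "llama3.2:3b",         # assumed tag for this blob
        "messages": [{"role": "user", "content": long_prompt}],
        "options": {"num_ctx": 32768},  # large enough to cover the 30805 tokens
        "stream": False,
    })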
[GIN] 2025/06/27 - 13:10:07 | 200 |      2.2901ms |       127.0.0.1 | HEAD     "/"
[GIN] 2025/06/27 - 13:10:07 | 200 |       546.3µs |       127.0.0.1 | GET      "/api/ps"
[GIN] 2025/06/27 - 13:18:55 | 200 |         9m49s |       127.0.0.1 | POST     "/api/chat"
[GIN] 2025/06/27 - 13:19:07 | 200 |            0s |       127.0.0.1 | HEAD     "/"
[GIN] 2025/06/27 - 13:19:07 | 200 |            0s |       127.0.0.1 | GET      "/api/ps"
[GIN] 2025/06/27 - 13:20:06 | 200 |            0s |       127.0.0.1 | HEAD     "/"
[GIN] 2025/06/27 - 13:20:06 | 200 |            0s |       127.0.0.1 | GET      "/api/ps"
[GIN] 2025/06/27 - 13:20:15 | 200 |       551.1µs |       127.0.0.1 | HEAD     "/"
[GIN] 2025/06/27 - 13:20:15 | 200 |      8.5213ms |       127.0.0.1 | POST     "/api/generate"
[GIN] 2025/06/27 - 13:20:19 | 200 |            0s |       127.0.0.1 | HEAD     "/"
[GIN] 2025/06/27 - 13:20:19 | 200 |            0s |       127.0.0.1 | GET      "/api/ps"
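The repeated HEAD "/" plus GET "/api/ps" pairs above look like periodic `ollama ps` polls; the same endpoint can be queried directly to see how much of each loaded model actually sits in VRAM (a sketch; the size/size_vram field names are an assumption based on the public /api/ps response):

    import requests

    for m in requests.get("http://127.0.0.1:11434/api/ps").json().get("models", []):
        # size is the total model footprint, size_vram the part resident on the GPU
        print(m["name"], m["size"], m["size_vram"])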
time=2025-06-27T13:22:38.384-05:00 level=INFO source=server.go:135 msg="system memory" total="24.0 GiB" free="7.6 GiB" free_swap="12.2 GiB"
time=2025-06-27T13:22:39.446-05:00 level=INFO source=server.go:168 msg=offload library=rocm layers.requested=-1 layers.model=29 layers.offload=12 layers.split="" memory.available="[2.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="3.8 GiB" memory.required.partial="2.2 GiB" memory.required.kv="896.0 MiB" memory.required.allocati GiB]" memory.weights.total="1.9 GiB" memory.weights.repeating="1.6 GiB" memory.weights.n MiB" memory.graph.full="424.0 MiB" memory.graph.partial="570.7 MiB"
time=2025-06-27T13:22:39.446-05:00 level=INFO source=server.go:211 msg="enabling flash attention"
time=2025-06-27T13:22:39.446-05:00 level=WARN source=server.go:219 msg="kv cache type not supported by model" type=""
llama_model_loader: loaded meta data with 30 key-value pairs and 255 tensors from C:\Users\mlt\.ollama\models\blobs\sha256-dde5aa3fc5ffc17176b5e8bdc82f587b24b2678c6c66101bf7da77af9f7ccdff (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 3B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Llama-3.2
llama_model_loader: - kv   5:                         general.size_label str              = 3B
llama_model_loader: - kv   6:                               general.tags arr[str,6]       = ["facebook", "meta", "pytorch", "llam...
llama_model_loader: - kv   7:                          general.languages arr[str,8]       = ["en", "de", "fr", "it", "pt", "hi", ...
llama_model_loader: - kv   8:                          llama.block_count u32              = 28
llama_model_loader: - kv   9:                       llama.context_length u32              = 131072
llama_model_loader: - kv  10:                     llama.embedding_length u32              = 3072
llama_model_loader: - kv  11:                  llama.feed_forward_length u32              = 8192
llama_model_loader: - kv  12:                 llama.attention.head_count u32              = 24
llama_model_loader: - kv  13:              llama.attention.head_count_kv u32              = 8
llama_model_loader: - kv  14:                       llama.rope.freq_base f32              = 500000.000000
llama_model_loader: - kv  15:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
llama_model_loader: - kv  16:                 llama.attention.key_length u32              = 128
llama_model_loader: - kv  17:               llama.attention.value_length u32              = 128
llama_model_loader: - kv  18:                          general.file_type u32              = 15
llama_model_loader: - kv  19:                           llama.vocab_size u32              = 128256
llama_model_loader: - kv  20:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv  21:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  22:                         tokenizer.ggml.pre str              = llama-bpe
llama_model_loader: - kv  23:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  24:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  25:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
llama_model_loader: - kv  26:                tokenizer.ggml.bos_token_id u32              = 128000
llama_model_loader: - kv  27:                tokenizer.ggml.eos_token_id u32              = 128009
llama_model_loader: - kv  28:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
llama_model_loader: - kv  29:               general.quantization_version u32              = 2
llama_model_loader: - type  f32:   58 tensors
llama_model_loader: - type q4_K:  168 tensors
llama_model_loader: - type q6_K:   29 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type   = Q4_K - Medium
print_info: file size   = 1.87 GiB (5.01 BPW)
load: special tokens cache size = 256
load: token to piece cache size = 0.7999 MB
print_info: arch             = llama
print_info: vocab_only       = 1
  490. print_info: model type       = ?B
  491. print_info: model params     = 3.21 B
  492. print_info: general.name     = Llama 3.2 3B Instruct
  493. print_info: vocab type       = BPE
  494. print_info: n_vocab          = 128256
  495. print_info: n_merges         = 280147
  496. print_info: BOS token        = 128000 '<|begin_of_text|>'
  497. print_info: EOS token        = 128009 '<|eot_id|>'
  498. print_info: EOT token        = 128009 '<|eot_id|>'
  499. print_info: EOM token        = 128008 '<|eom_id|>'
  500. print_info: LF token         = 198 'Ċ'
  501. print_info: EOG token        = 128008 '<|eom_id|>'
  502. print_info: EOG token        = 128009 '<|eot_id|>'
  503. print_info: max token length = 256
  504. llama_model_load: vocab only - skipping tensors
  505. time=2025-06-27T13:22:39.927-05:00 level=INFO source=server.go:431 msg="starting llama server" cmd="C:\\\\dev\\\\ollama\\\\ollama\\\\ollama.exe runner --model C:\\\\Users\\\\mlt\\\\.ollama\\\\models\\\\blobs\\\\sha256-dde5aa3fc5ffc17176b5e8bdc82f587b24b2678c6c66101bf7da77af9f7ccdff --ctx-size 8192 --batch-size 512 --n-gpu-layers 12 --threads 4 --flash-attn --parallel 1 --port 58424"
  506. time=2025-06-27T13:22:39.936-05:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
  507. time=2025-06-27T13:22:39.936-05:00 level=INFO source=server.go:591 msg="waiting for llama runner to start responding"
  508. time=2025-06-27T13:22:39.937-05:00 level=INFO source=server.go:625 msg="waiting for server to become available" status="llm server error"
  509. time=2025-06-27T13:22:40.027-05:00 level=INFO source=runner.go:815 msg="starting go runner"
  510. ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
  511. ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
  512. ggml_cuda_init: found 1 ROCm devices:
  513.   Device 0: AMD Radeon RX 6400, gfx1034 (0x1034), VMM: no, Wave Size: 32
  514. load_backend: loaded ROCm backend from C:\dev\ollama\ollama\build\lib\ollama\ggml-hip.dll
  515. load_backend: loaded CPU backend from C:\dev\ollama\ollama\build\lib\ollama\ggml-cpu-sandybridge.dll
  516. time=2025-06-27T13:22:40.132-05:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.LLAMAFILE=1 CPU.1.SSE3=1 CPU.1.LLAMAFILE=1 ROCm.0.NO_VMM=1 ROCm.0.NO_PEER_COPY=1 ROCm.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(gcc)
  517. time=2025-06-27T13:22:40.134-05:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:58424"
  518. time=2025-06-27T13:22:40.190-05:00 level=INFO source=server.go:625 msg="waiting for server to become available" status="llm server loading model"
  519. llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon RX 6400) - 3939 MiB free
  520. llama_model_loader: loaded meta data with 30 key-value pairs and 255 tensors from C:\Users\mlt\.ollama\models\blobs\sha256-dde5aa3fc5ffc17176b5e8bdc82f587b24b2678c6c66101bf7da77af9f7ccdff (version GGUF V3 (latest))
  521. llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
  522. llama_model_loader: - kv   0:                       general.architecture str              = llama
  523. llama_model_loader: - kv   1:                               general.type str              = model
  524. llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 3B Instruct
  525. llama_model_loader: - kv   3:                           general.finetune str              = Instruct
  526. llama_model_loader: - kv   4:                           general.basename str              = Llama-3.2
  527. llama_model_loader: - kv   5:                         general.size_label str              = 3B
  528. llama_model_loader: - kv   6:                               general.tags arr[str,6]       = ["facebook", "meta", "pytorch", "llam...
  529. llama_model_loader: - kv   7:                          general.languages arr[str,8]       = ["en", "de", "fr", "it", "pt", "hi", ...
  530. llama_model_loader: - kv   8:                          llama.block_count u32              = 28
  531. llama_model_loader: - kv   9:                       llama.context_length u32              = 131072
  532. llama_model_loader: - kv  10:                     llama.embedding_length u32              = 3072
  533. llama_model_loader: - kv  11:                  llama.feed_forward_length u32              = 8192
  534. llama_model_loader: - kv  12:                 llama.attention.head_count u32              = 24
  535. llama_model_loader: - kv  13:              llama.attention.head_count_kv u32              = 8
  536. llama_model_loader: - kv  14:                       llama.rope.freq_base f32              = 500000.000000
  537. llama_model_loader: - kv  15:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
  538. llama_model_loader: - kv  16:                 llama.attention.key_length u32              = 128
  539. llama_model_loader: - kv  17:               llama.attention.value_length u32              = 128
  540. llama_model_loader: - kv  18:                          general.file_type u32              = 15
  541. llama_model_loader: - kv  19:                           llama.vocab_size u32              = 128256
  542. llama_model_loader: - kv  20:                 llama.rope.dimension_count u32              = 128
  543. llama_model_loader: - kv  21:                       tokenizer.ggml.model str              = gpt2
  544. llama_model_loader: - kv  22:                         tokenizer.ggml.pre str              = llama-bpe
  545. llama_model_loader: - kv  23:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
  546. llama_model_loader: - kv  24:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
  547. llama_model_loader: - kv  25:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
  548. llama_model_loader: - kv  26:                tokenizer.ggml.bos_token_id u32              = 128000
  549. llama_model_loader: - kv  27:                tokenizer.ggml.eos_token_id u32              = 128009
  550. llama_model_loader: - kv  28:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
  551. llama_model_loader: - kv  29:               general.quantization_version u32              = 2
  552. llama_model_loader: - type  f32:   58 tensors
  553. llama_model_loader: - type q4_K:  168 tensors
  554. llama_model_loader: - type q6_K:   29 tensors
  555. print_info: file format = GGUF V3 (latest)
  556. print_info: file type   = Q4_K - Medium
  557. print_info: file size   = 1.87 GiB (5.01 BPW)
  558. load: special tokens cache size = 256
  559. load: token to piece cache size = 0.7999 MB
  560. print_info: arch             = llama
  561. print_info: vocab_only       = 0
  562. print_info: n_ctx_train      = 131072
  563. print_info: n_embd           = 3072
  564. print_info: n_layer          = 28
  565. print_info: n_head           = 24
  566. print_info: n_head_kv        = 8
  567. print_info: n_rot            = 128
  568. print_info: n_swa            = 0
  569. print_info: n_swa_pattern    = 1
  570. print_info: n_embd_head_k    = 128
  571. print_info: n_embd_head_v    = 128
  572. print_info: n_gqa            = 3
  573. print_info: n_embd_k_gqa     = 1024
  574. print_info: n_embd_v_gqa     = 1024
  575. print_info: f_norm_eps       = 0.0e+00
  576. print_info: f_norm_rms_eps   = 1.0e-05
  577. print_info: f_clamp_kqv      = 0.0e+00
  578. print_info: f_max_alibi_bias = 0.0e+00
  579. print_info: f_logit_scale    = 0.0e+00
  580. print_info: f_attn_scale     = 0.0e+00
  581. print_info: n_ff             = 8192
  582. print_info: n_expert         = 0
  583. print_info: n_expert_used    = 0
  584. print_info: causal attn      = 1
  585. print_info: pooling type     = 0
  586. print_info: rope type        = 0
  587. print_info: rope scaling     = linear
  588. print_info: freq_base_train  = 500000.0
  589. print_info: freq_scale_train = 1
  590. print_info: n_ctx_orig_yarn  = 131072
  591. print_info: rope_finetuned   = unknown
  592. print_info: ssm_d_conv       = 0
  593. print_info: ssm_d_inner      = 0
  594. print_info: ssm_d_state      = 0
  595. print_info: ssm_dt_rank      = 0
  596. print_info: ssm_dt_b_c_rms   = 0
  597. print_info: model type       = 3B
  598. print_info: model params     = 3.21 B
  599. print_info: general.name     = Llama 3.2 3B Instruct
  600. print_info: vocab type       = BPE
  601. print_info: n_vocab          = 128256
  602. print_info: n_merges         = 280147
  603. print_info: BOS token        = 128000 '<|begin_of_text|>'
  604. print_info: EOS token        = 128009 '<|eot_id|>'
  605. print_info: EOT token        = 128009 '<|eot_id|>'
  606. print_info: EOM token        = 128008 '<|eom_id|>'
  607. print_info: LF token         = 198 'Ċ'
  608. print_info: EOG token        = 128008 '<|eom_id|>'
  609. print_info: EOG token        = 128009 '<|eot_id|>'
  610. print_info: max token length = 256
  611. load_tensors: loading model tensors, this can take a while... (mmap = true)
  612. load_tensors: offloading 12 repeating layers to GPU
  613. load_tensors: offloaded 12/29 layers to GPU
  614. load_tensors:   CPU_Mapped model buffer size =  1918.35 MiB
  615. load_tensors:        ROCm0 model buffer size =   703.20 MiB
  616. llama_context: constructing llama_context
  617. llama_context: n_seq_max     = 1
  618. llama_context: n_ctx         = 8192
  619. llama_context: n_ctx_per_seq = 8192
  620. llama_context: n_batch       = 512
  621. llama_context: n_ubatch      = 512
  622. llama_context: causal_attn   = 1
  623. llama_context: flash_attn    = 1
  624. llama_context: freq_base     = 500000.0
  625. llama_context: freq_scale    = 1
  626. llama_context: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
  627. llama_context:        CPU  output buffer size =     0.50 MiB
  628. llama_kv_cache_unified: kv_size = 8192, type_k = 'f16', type_v = 'f16', n_layer = 28, can_shift = 1, padding = 256
  629. llama_kv_cache_unified:        CPU KV buffer size =   512.00 MiB
  630. llama_kv_cache_unified:      ROCm0 KV buffer size =   384.00 MiB
  631. llama_kv_cache_unified: KV self size  =  896.00 MiB, K (f16):  448.00 MiB, V (f16):  448.00 MiB
  632. llama_context:      ROCm0 compute buffer size =   564.73 MiB
  633. llama_context:  ROCm_Host compute buffer size =    22.01 MiB
  634. llama_context: graph nodes  = 847
  635. llama_context: graph splits = 181 (with bs=512), 3 (with bs=1)
  636. time=2025-06-27T13:22:47.212-05:00 level=INFO source=server.go:630 msg="llama runner started in 7.28 seconds"
  637. [GIN] 2025/06/27 - 13:30:58 | 200 |         8m26s |       127.0.0.1 | POST     "/api/chat"
  638.  time=2025-06-27T13:31:08.737-05:00 level=INFO source=server.go:135 msg="system memory" total="24.0 GiB" free="7.6 GiB" free_swap="10.7 GiB"
  639.  time=2025-06-27T13:31:10.166-05:00 level=INFO source=server.go:168 msg=offload library=rocm layers.requested=-1 layers.model=37 layers.offload=0 layers.split="" memory.available="[1.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="2.9 GiB" memory.required.partial="0 B" memory.required.kv="1.1 GiB" memory.required.allocati B]" memory.weights.total="1.8 GiB" memory.weights.repeating="1.6 GiB" memory.weights.n MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.3 GiB"
  640. time=2025-06-27T13:31:10.166-05:00 level=WARN source=server.go:199 msg="flash attention enabled but not supported by gpu"
  641. llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from C:\Users\mlt\.ollama\models\blobs\sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
  642. llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
  643. llama_model_loader: - kv   0:                       general.architecture str              = qwen2
  644. llama_model_loader: - kv   1:                               general.type str              = model
  645. llama_model_loader: - kv   2:                               general.name str              = Qwen2.5 3B Instruct
  646. llama_model_loader: - kv   3:                           general.finetune str              = Instruct
  647. llama_model_loader: - kv   4:                           general.basename str              = Qwen2.5
  648. llama_model_loader: - kv   5:                         general.size_label str              = 3B
  649. llama_model_loader: - kv   6:                            general.license str              = other
  650. llama_model_loader: - kv   7:                       general.license.name str              = qwen-research
  651. llama_model_loader: - kv   8:                       general.license.link str              = https://huggingface.co/Qwen/Qwen2.5-3...
  652. llama_model_loader: - kv   9:                   general.base_model.count u32              = 1
  653. llama_model_loader: - kv  10:                  general.base_model.0.name str              = Qwen2.5 3B
  654. llama_model_loader: - kv  11:          general.base_model.0.organization str              = Qwen
  655. llama_model_loader: - kv  12:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen2.5-3B
  656. llama_model_loader: - kv  13:                               general.tags arr[str,2]       = ["chat", "text-generation"]
  657. llama_model_loader: - kv  14:                          general.languages arr[str,1]       = ["en"]
  658. llama_model_loader: - kv  15:                          qwen2.block_count u32              = 36
  659. llama_model_loader: - kv  16:                       qwen2.context_length u32              = 32768
  660. llama_model_loader: - kv  17:                     qwen2.embedding_length u32              = 2048
  661. llama_model_loader: - kv  18:                  qwen2.feed_forward_length u32              = 11008
  662. llama_model_loader: - kv  19:                 qwen2.attention.head_count u32              = 16
  663. llama_model_loader: - kv  20:              qwen2.attention.head_count_kv u32              = 2
  664. llama_model_loader: - kv  21:                       qwen2.rope.freq_base f32              = 1000000.000000
  665. llama_model_loader: - kv  22:     qwen2.attention.layer_norm_rms_epsilon f32              = 0.000001
  666. llama_model_loader: - kv  23:                          general.file_type u32              = 15
  667. llama_model_loader: - kv  24:                       tokenizer.ggml.model str              = gpt2
  668. llama_model_loader: - kv  25:                         tokenizer.ggml.pre str              = qwen2
  669. llama_model_loader: - kv  26:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
  670. llama_model_loader: - kv  27:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
  671. llama_model_loader: - kv  28:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
  672. llama_model_loader: - kv  29:                tokenizer.ggml.eos_token_id u32              = 151645
  673. llama_model_loader: - kv  30:            tokenizer.ggml.padding_token_id u32              = 151643
  674. llama_model_loader: - kv  31:                tokenizer.ggml.bos_token_id u32              = 151643
  675. llama_model_loader: - kv  32:               tokenizer.ggml.add_bos_token bool             = false
  676. llama_model_loader: - kv  33:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
  677. llama_model_loader: - kv  34:               general.quantization_version u32              = 2
  678. llama_model_loader: - type  f32:  181 tensors
  679. llama_model_loader: - type q4_K:  216 tensors
  680. llama_model_loader: - type q6_K:   37 tensors
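Note: the "35 key-value pairs and 434 tensors ... GGUF V3" summary above comes straight from the file's fixed header. A self-contained Go sketch that reads just that header from the blob named in the log; the layout follows the published GGUF spec (magic, u32 version, u64 tensor count, u64 metadata KV count), and only the fixed fields are parsed, not the KV payload:

    package main

    import (
        "encoding/binary"
        "fmt"
        "os"
    )

    func main() {
        // Blob path taken verbatim from the log lines above.
        f, err := os.Open(`C:\Users\mlt\.ollama\models\blobs\sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6`)
        if err != nil {
            fmt.Println(err)
            return
        }
        defer f.Close()

        var hdr struct {
            Magic       [4]byte // "GGUF"
            Version     uint32  // 3, i.e. "GGUF V3 (latest)"
            TensorCount uint64  // 434 in this dump
            MetaKVCount uint64  // 35 in this dump
        }
        if err := binary.Read(f, binary.LittleEndian, &hdr); err != nil {
            fmt.Println(err)
            return
        }
        fmt.Printf("%s v%d: %d tensors, %d metadata KVs\n",
            hdr.Magic[:], hdr.Version, hdr.TensorCount, hdr.MetaKVCount)
    }
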
  681. print_info: file format = GGUF V3 (latest)
  682. print_info: file type   = Q4_K - Medium
  683. print_info: file size   = 1.79 GiB (4.99 BPW)
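Note: the "(4.99 BPW)" figure is just file size in bits divided by parameter count. Redoing it with the rounded values printed here lands within a couple hundredths; the loader uses exact byte and parameter counts:

    package main

    import "fmt"

    func main() {
        // 1.79 GiB and 3.09 B params are the rounded values from the log,
        // hence ~4.98 here versus the loader's 4.99.
        const GiB = 1 << 30
        fileBits := 1.79 * GiB * 8
        params := 3.09e9
        fmt.Printf("%.2f bits per weight\n", fileBits/params)
    }
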
  684. load: special tokens cache size = 22
  685. load: token to piece cache size = 0.9310 MB
  686. print_info: arch             = qwen2
  687. print_info: vocab_only       = 1
  688. print_info: model type       = ?B
  689. print_info: model params     = 3.09 B
  690. print_info: general.name     = Qwen2.5 3B Instruct
  691. print_info: vocab type       = BPE
  692. print_info: n_vocab          = 151936
  693. print_info: n_merges         = 151387
  694. print_info: BOS token        = 151643 '<|endoftext|>'
  695. print_info: EOS token        = 151645 '<|im_end|>'
  696. print_info: EOT token        = 151645 '<|im_end|>'
  697. print_info: PAD token        = 151643 '<|endoftext|>'
  698. print_info: LF token         = 198 'Ċ'
  699. print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
  700. print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
  701. print_info: FIM MID token    = 151660 '<|fim_middle|>'
  702. print_info: FIM PAD token    = 151662 '<|fim_pad|>'
  703. print_info: FIM REP token    = 151663 '<|repo_name|>'
  704. print_info: FIM SEP token    = 151664 '<|file_sep|>'
  705. print_info: EOG token        = 151643 '<|endoftext|>'
  706. print_info: EOG token        = 151645 '<|im_end|>'
  707. print_info: EOG token        = 151662 '<|fim_pad|>'
  708. print_info: EOG token        = 151663 '<|repo_name|>'
  709. print_info: EOG token        = 151664 '<|file_sep|>'
  710. print_info: max token length = 256
  711. llama_model_load: vocab only - skipping tensors
  712. time=2025-06-27T13:31:10.883-05:00 level=INFO source=server.go:431 msg="starting llama server" cmd="C:\\\\dev\\\\ollama\\\\ollama\\\\ollama.exe runner --model C:\\\\Users\\\\mlt\\\\.ollama\\\\models\\\\blobs\\\\sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 --ctx-size 32768 --batch-size 512 --threads 4 --no-mmap --parallel 1 --port 58892"
  713. time=2025-06-27T13:31:10.896-05:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
  714. time=2025-06-27T13:31:10.896-05:00 level=INFO source=server.go:591 msg="waiting for llama runner to start responding"
  715. time=2025-06-27T13:31:10.900-05:00 level=INFO source=server.go:625 msg="waiting for server to become available" status="llm server error"
  716. time=2025-06-27T13:31:11.037-05:00 level=INFO source=runner.go:815 msg="starting go runner"
  717. load_backend: loaded CPU backend from C:\dev\ollama\ollama\build\lib\ollama\ggml-cpu-sandybridge.dll
  718. time=2025-06-27T13:31:11.126-05:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.LLAMAFILE=1 CPU.1.SSE3=1 CPU.1.LLAMAFILE=1 compiler=cgo(gcc)
  719. time=2025-06-27T13:31:11.129-05:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:58892"
  720. time=2025-06-27T13:31:11.155-05:00 level=INFO source=server.go:625 msg="waiting for server to become available" status="llm server loading model"
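Note: the sequence above, spawn the runner subprocess, then log "waiting for llama runner to start responding" until the status flips from "llm server error" to "llm server loading model", is a readiness poll against the runner's port. A minimal sketch of the pattern in generic Go; the /health path and the 5-minute budget are assumptions, not lifted from Ollama's source:

    package main

    import (
        "fmt"
        "net/http"
        "time"
    )

    // waitForRunner polls url until it answers or the deadline passes.
    func waitForRunner(url string, timeout time.Duration) error {
        deadline := time.Now().Add(timeout)
        for time.Now().Before(deadline) {
            resp, err := http.Get(url)
            if err == nil {
                resp.Body.Close()
                return nil // runner is accepting connections
            }
            time.Sleep(100 * time.Millisecond)
        }
        return fmt.Errorf("runner at %s not up after %s", url, timeout)
    }

    func main() {
        // 58892 is the --port value from the runner command line above.
        fmt.Println(waitForRunner("http://127.0.0.1:58892/health", 5*time.Minute))
    }
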
  721. llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from C:\Users\mlt\.ollama\models\blobs\sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
  722. llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
  723. llama_model_loader: - kv   0:                       general.architecture str              = qwen2
  724. llama_model_loader: - kv   1:                               general.type str              = model
  725. llama_model_loader: - kv   2:                               general.name str              = Qwen2.5 3B Instruct
  726. llama_model_loader: - kv   3:                           general.finetune str              = Instruct
  727. llama_model_loader: - kv   4:                           general.basename str              = Qwen2.5
  728. llama_model_loader: - kv   5:                         general.size_label str              = 3B
  729. llama_model_loader: - kv   6:                            general.license str              = other
  730. llama_model_loader: - kv   7:                       general.license.name str              = qwen-research
  731. llama_model_loader: - kv   8:                       general.license.link str              = https://huggingface.co/Qwen/Qwen2.5-3...
  732. llama_model_loader: - kv   9:                   general.base_model.count u32              = 1
  733. llama_model_loader: - kv  10:                  general.base_model.0.name str              = Qwen2.5 3B
  734. llama_model_loader: - kv  11:          general.base_model.0.organization str              = Qwen
  735. llama_model_loader: - kv  12:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen2.5-3B
  736. llama_model_loader: - kv  13:                               general.tags arr[str,2]       = ["chat", "text-generation"]
  737. llama_model_loader: - kv  14:                          general.languages arr[str,1]       = ["en"]
  738. llama_model_loader: - kv  15:                          qwen2.block_count u32              = 36
  739. llama_model_loader: - kv  16:                       qwen2.context_length u32              = 32768
  740. llama_model_loader: - kv  17:                     qwen2.embedding_length u32              = 2048
  741. llama_model_loader: - kv  18:                  qwen2.feed_forward_length u32              = 11008
  742. llama_model_loader: - kv  19:                 qwen2.attention.head_count u32              = 16
  743. llama_model_loader: - kv  20:              qwen2.attention.head_count_kv u32              = 2
  744. llama_model_loader: - kv  21:                       qwen2.rope.freq_base f32              = 1000000.000000
  745. llama_model_loader: - kv  22:     qwen2.attention.layer_norm_rms_epsilon f32              = 0.000001
  746. llama_model_loader: - kv  23:                          general.file_type u32              = 15
  747. llama_model_loader: - kv  24:                       tokenizer.ggml.model str              = gpt2
  748. llama_model_loader: - kv  25:                         tokenizer.ggml.pre str              = qwen2
  749. llama_model_loader: - kv  26:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
  750. llama_model_loader: - kv  27:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
  751. llama_model_loader: - kv  28:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
  752. llama_model_loader: - kv  29:                tokenizer.ggml.eos_token_id u32              = 151645
  753. llama_model_loader: - kv  30:            tokenizer.ggml.padding_token_id u32              = 151643
  754. llama_model_loader: - kv  31:                tokenizer.ggml.bos_token_id u32              = 151643
  755. llama_model_loader: - kv  32:               tokenizer.ggml.add_bos_token bool             = false
  756. llama_model_loader: - kv  33:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
  757. llama_model_loader: - kv  34:               general.quantization_version u32              = 2
  758. llama_model_loader: - type  f32:  181 tensors
  759. llama_model_loader: - type q4_K:  216 tensors
  760. llama_model_loader: - type q6_K:   37 tensors
  761. print_info: file format = GGUF V3 (latest)
  762. print_info: file type   = Q4_K - Medium
  763. print_info: file size   = 1.79 GiB (4.99 BPW)
  764. load: special tokens cache size = 22
  765. load: token to piece cache size = 0.9310 MB
  766. print_info: arch             = qwen2
  767. print_info: vocab_only       = 0
  768. print_info: n_ctx_train      = 32768
  769. print_info: n_embd           = 2048
  770. print_info: n_layer          = 36
  771. print_info: n_head           = 16
  772. print_info: n_head_kv        = 2
  773. print_info: n_rot            = 128
  774. print_info: n_swa            = 0
  775. print_info: n_swa_pattern    = 1
  776. print_info: n_embd_head_k    = 128
  777. print_info: n_embd_head_v    = 128
  778. print_info: n_gqa            = 8
  779. print_info: n_embd_k_gqa     = 256
  780. print_info: n_embd_v_gqa     = 256
  781. print_info: f_norm_eps       = 0.0e+00
  782. print_info: f_norm_rms_eps   = 1.0e-06
  783. print_info: f_clamp_kqv      = 0.0e+00
  784. print_info: f_max_alibi_bias = 0.0e+00
  785. print_info: f_logit_scale    = 0.0e+00
  786. print_info: f_attn_scale     = 0.0e+00
  787. print_info: n_ff             = 11008
  788. print_info: n_expert         = 0
  789. print_info: n_expert_used    = 0
  790. print_info: causal attn      = 1
  791. print_info: pooling type     = -1
  792. print_info: rope type        = 2
  793. print_info: rope scaling     = linear
  794. print_info: freq_base_train  = 1000000.0
  795. print_info: freq_scale_train = 1
  796. print_info: n_ctx_orig_yarn  = 32768
  797. print_info: rope_finetuned   = unknown
  798. print_info: ssm_d_conv       = 0
  799. print_info: ssm_d_inner      = 0
  800. print_info: ssm_d_state      = 0
  801. print_info: ssm_dt_rank      = 0
  802. print_info: ssm_dt_b_c_rms   = 0
  803. print_info: model type       = 3B
  804. print_info: model params     = 3.09 B
  805. print_info: general.name     = Qwen2.5 3B Instruct
  806. print_info: vocab type       = BPE
  807. print_info: n_vocab          = 151936
  808. print_info: n_merges         = 151387
  809. print_info: BOS token        = 151643 '<|endoftext|>'
  810. print_info: EOS token        = 151645 '<|im_end|>'
  811. print_info: EOT token        = 151645 '<|im_end|>'
  812. print_info: PAD token        = 151643 '<|endoftext|>'
  813. print_info: LF token         = 198 'Ċ'
  814. print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
  815. print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
  816. print_info: FIM MID token    = 151660 '<|fim_middle|>'
  817. print_info: FIM PAD token    = 151662 '<|fim_pad|>'
  818. print_info: FIM REP token    = 151663 '<|repo_name|>'
  819. print_info: FIM SEP token    = 151664 '<|file_sep|>'
  820. print_info: EOG token        = 151643 '<|endoftext|>'
  821. print_info: EOG token        = 151645 '<|im_end|>'
  822. print_info: EOG token        = 151662 '<|fim_pad|>'
  823. print_info: EOG token        = 151663 '<|repo_name|>'
  824. print_info: EOG token        = 151664 '<|file_sep|>'
  825. print_info: max token length = 256
  826. load_tensors: loading model tensors, this can take a while... (mmap = false)
  827. load_tensors:          CPU model buffer size =  1834.82 MiB
  828. llama_context: constructing llama_context
  829. llama_context: n_seq_max     = 1
  830. llama_context: n_ctx         = 32768
  831. llama_context: n_ctx_per_seq = 32768
  832. llama_context: n_batch       = 512
  833. llama_context: n_ubatch      = 512
  834. llama_context: causal_attn   = 1
  835. llama_context: flash_attn    = 0
  836. llama_context: freq_base     = 1000000.0
  837. llama_context: freq_scale    = 1
  838. llama_context:        CPU  output buffer size =     0.59 MiB
  839. llama_kv_cache_unified: kv_size = 32768, type_k = 'f16', type_v = 'f16', n_layer = 36, can_shift = 1, padding = 32
  840. llama_kv_cache_unified:        CPU KV buffer size =  1152.00 MiB
  841. llama_kv_cache_unified: KV self size  = 1152.00 MiB, K (f16):  576.00 MiB, V (f16):  576.00 MiB
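Note: the 1152 MiB KV size follows directly from the numbers printed above: kv_size 32768 x n_embd_k_gqa 256 x n_layer 36 x 2 bytes (f16) = 576 MiB for K, and the same again for V. A quick check:

    package main

    import "fmt"

    func main() {
        // All four factors come from the log: n_ctx, per-layer KV width,
        // layer count, and 2 bytes per f16 element.
        kvSize, embdKGQA, nLayer, f16 := 32768, 256, 36, 2
        k := kvSize * embdKGQA * nLayer * f16 // bytes for the K half
        fmt.Printf("K = %d MiB, K+V = %d MiB\n", k>>20, (2*k)>>20)
        // Output: K = 576 MiB, K+V = 1152 MiB
    }

The same formula covers the Llama 3.2 load further down: 8192 x 1024 x 28 x 2 bytes = 448 MiB per half, 896 MiB total, matching its llama_kv_cache_unified line.
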
  842. llama_context:        CPU compute buffer size =  1104.01 MiB
  843. llama_context: graph nodes  = 1338
  844. llama_context: graph splits = 1
  845. time=2025-06-27T13:31:16.704-05:00 level=INFO source=server.go:630 msg="llama runner started in 5.81 seconds"
  846. [GIN] 2025/06/27 - 13:40:23 | 200 |            0s |       127.0.0.1 | HEAD     "/"
  847. [GIN] 2025/06/27 - 13:40:23 | 200 |            0s |       127.0.0.1 | GET      "/api/ps"
  848. [GIN] 2025/06/27 - 14:00:08 | 200 |         29m8s |       127.0.0.1 | POST     "/api/chat"
  849. time=2025-06-27T14:59:12.959-05:00 level=INFO source=server.go:135 msg="system memory" total="24.0 GiB" free="1.5 GiB" free_swap="4.8 GiB"
  850. time=2025-06-27T14:59:14.103-05:00 level=INFO source=server.go:168 msg=offload library=rocm layers.requested=-1 layers.model=29 layers.offload=0 layers.split="" memory.available="[290.7 MiB]" memory.gpu_overhead="0 B" memory.required.full="2.7 GiB" memory.required.partial="0 B" memory.required.kv="896.0 MiB" memory.required.allocations="[0 B]" memory.weights.total="1.9 GiB" memory.weights.repeating="1.6 GiB" memory.weights.nonrepeating="308.2 MiB" memory.graph.full="424.0 MiB" memory.graph.partial="570.7 MiB"
  851. time=2025-06-27T14:59:14.103-05:00 level=WARN source=server.go:199 msg="flash attention enabled but not supported by gpu"
  852. llama_model_loader: loaded meta data with 30 key-value pairs and 255 tensors from C:\Users\mlt\.ollama\models\blobs\sha256-dde5aa3fc5ffc17176b5e8bdc82f587b24b2678c6c66101bf7da77af9f7ccdff (version GGUF V3 (latest))
  853. llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
  854. llama_model_loader: - kv   0:                       general.architecture str              = llama
  855. llama_model_loader: - kv   1:                               general.type str              = model
  856. llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 3B Instruct
  857. llama_model_loader: - kv   3:                           general.finetune str              = Instruct
  858. llama_model_loader: - kv   4:                           general.basename str              = Llama-3.2
  859. llama_model_loader: - kv   5:                         general.size_label str              = 3B
  860. llama_model_loader: - kv   6:                               general.tags arr[str,6]       = ["facebook", "meta", "pytorch", "llam...
  861. llama_model_loader: - kv   7:                          general.languages arr[str,8]       = ["en", "de", "fr", "it", "pt", "hi", ...
  862. llama_model_loader: - kv   8:                          llama.block_count u32              = 28
  863. llama_model_loader: - kv   9:                       llama.context_length u32              = 131072
  864. llama_model_loader: - kv  10:                     llama.embedding_length u32              = 3072
  865. llama_model_loader: - kv  11:                  llama.feed_forward_length u32              = 8192
  866. llama_model_loader: - kv  12:                 llama.attention.head_count u32              = 24
  867. llama_model_loader: - kv  13:              llama.attention.head_count_kv u32              = 8
  868. llama_model_loader: - kv  14:                       llama.rope.freq_base f32              = 500000.000000
  869. llama_model_loader: - kv  15:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
  870. llama_model_loader: - kv  16:                 llama.attention.key_length u32              = 128
  871. llama_model_loader: - kv  17:               llama.attention.value_length u32              = 128
  872. llama_model_loader: - kv  18:                          general.file_type u32              = 15
  873. llama_model_loader: - kv  19:                           llama.vocab_size u32              = 128256
  874. llama_model_loader: - kv  20:                 llama.rope.dimension_count u32              = 128
  875. llama_model_loader: - kv  21:                       tokenizer.ggml.model str              = gpt2
  876. llama_model_loader: - kv  22:                         tokenizer.ggml.pre str              = llama-bpe
  877. llama_model_loader: - kv  23:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
  878. llama_model_loader: - kv  24:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
  879. llama_model_loader: - kv  25:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
  880. llama_model_loader: - kv  26:                tokenizer.ggml.bos_token_id u32              = 128000
  881. llama_model_loader: - kv  27:                tokenizer.ggml.eos_token_id u32              = 128009
  882. llama_model_loader: - kv  28:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
  883. llama_model_loader: - kv  29:               general.quantization_version u32              = 2
  884. llama_model_loader: - type  f32:   58 tensors
  885. llama_model_loader: - type q4_K:  168 tensors
  886. llama_model_loader: - type q6_K:   29 tensors
  887. print_info: file format = GGUF V3 (latest)
  888. print_info: file type   = Q4_K - Medium
  889. print_info: file size   = 1.87 GiB (5.01 BPW)
  890. load: special tokens cache size = 256
  891. load: token to piece cache size = 0.7999 MB
  892. print_info: arch             = llama
  893. print_info: vocab_only       = 1
  894. print_info: model type       = ?B
  895. print_info: model params     = 3.21 B
  896. print_info: general.name     = Llama 3.2 3B Instruct
  897. print_info: vocab type       = BPE
  898. print_info: n_vocab          = 128256
  899. print_info: n_merges         = 280147
  900. print_info: BOS token        = 128000 '<|begin_of_text|>'
  901. print_info: EOS token        = 128009 '<|eot_id|>'
  902. print_info: EOT token        = 128009 '<|eot_id|>'
  903. print_info: EOM token        = 128008 '<|eom_id|>'
  904. print_info: LF token         = 198 'Ċ'
  905. print_info: EOG token        = 128008 '<|eom_id|>'
  906. print_info: EOG token        = 128009 '<|eot_id|>'
  907. print_info: max token length = 256
  908. llama_model_load: vocab only - skipping tensors
  909. time=2025-06-27T14:59:14.720-05:00 level=INFO source=server.go:431 msg="starting llama server" cmd="C:\\\\dev\\\\ollama\\\\ollama\\\\ollama.exe runner --model C:\\\\Users\\\\mlt\\\\.ollama\\\\models\\\\blobs\\\\sha256-dde5aa3fc5ffc17176b5e8bdc82f587b24b2678c6c66101bf7da77af9f7ccdff --ctx-size 8192 --batch-size 512 --threads 4 --no-mmap --parallel 1 --port 63871"
  910. time=2025-06-27T14:59:14.731-05:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
  911. time=2025-06-27T14:59:14.731-05:00 level=INFO source=server.go:591 msg="waiting for llama runner to start responding"
  912. time=2025-06-27T14:59:14.731-05:00 level=INFO source=server.go:625 msg="waiting for server to become available" status="llm server error"
  913. time=2025-06-27T14:59:14.921-05:00 level=INFO source=runner.go:815 msg="starting go runner"
  914. load_backend: loaded CPU backend from C:\dev\ollama\ollama\build\lib\ollama\ggml-cpu-sandybridge.dll
  915. time=2025-06-27T14:59:15.002-05:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.LLAMAFILE=1 CPU.1.SSE3=1 CPU.1.LLAMAFILE=1 compiler=cgo(gcc)
  916. time=2025-06-27T14:59:15.007-05:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:63871"
  917. llama_model_loader: loaded meta data with 30 key-value pairs and 255 tensors from C:\Users\mlt\.ollama\models\blobs\sha256-dde5aa3fc5ffc17176b5e8bdc82f587b24b2678c6c66101bf7da77af9f7ccdff (version GGUF V3 (latest))
  918. llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
  919. llama_model_loader: - kv   0:                       general.architecture str              = llama
  920. llama_model_loader: - kv   1:                               general.type str              = model
  921. llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 3B Instruct
  922. llama_model_loader: - kv   3:                           general.finetune str              = Instruct
  923. llama_model_loader: - kv   4:                           general.basename str              = Llama-3.2
  924. llama_model_loader: - kv   5:                         general.size_label str              = 3B
  925. llama_model_loader: - kv   6:                               general.tags arr[str,6]       = ["facebook", "meta", "pytorch", "llam...
  926. llama_model_loader: - kv   7:                          general.languages arr[str,8]       = ["en", "de", "fr", "it", "pt", "hi", ...
  927. llama_model_loader: - kv   8:                          llama.block_count u32              = 28
  928. llama_model_loader: - kv   9:                       llama.context_length u32              = 131072
  929. llama_model_loader: - kv  10:                     llama.embedding_length u32              = 3072
  930. llama_model_loader: - kv  11:                  llama.feed_forward_length u32              = 8192
  931. llama_model_loader: - kv  12:                 llama.attention.head_count u32              = 24
  932. llama_model_loader: - kv  13:              llama.attention.head_count_kv u32              = 8
  933. llama_model_loader: - kv  14:                       llama.rope.freq_base f32              = 500000.000000
  934. llama_model_loader: - kv  15:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
  935. llama_model_loader: - kv  16:                 llama.attention.key_length u32              = 128
  936. llama_model_loader: - kv  17:               llama.attention.value_length u32              = 128
  937. llama_model_loader: - kv  18:                          general.file_type u32              = 15
  938. llama_model_loader: - kv  19:                           llama.vocab_size u32              = 128256
  939. llama_model_loader: - kv  20:                 llama.rope.dimension_count u32              = 128
  940. llama_model_loader: - kv  21:                       tokenizer.ggml.model str              = gpt2
  941. llama_model_loader: - kv  22:                         tokenizer.ggml.pre str              = llama-bpe
  942. llama_model_loader: - kv  23:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
  943. llama_model_loader: - kv  24:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
  944. llama_model_loader: - kv  25:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
  945. llama_model_loader: - kv  26:                tokenizer.ggml.bos_token_id u32              = 128000
  946. llama_model_loader: - kv  27:                tokenizer.ggml.eos_token_id u32              = 128009
  947. llama_model_loader: - kv  28:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
  948. llama_model_loader: - kv  29:               general.quantization_version u32              = 2
  949. llama_model_loader: - type  f32:   58 tensors
  950. llama_model_loader: - type q4_K:  168 tensors
  951. llama_model_loader: - type q6_K:   29 tensors
  952. print_info: file format = GGUF V3 (latest)
  953. print_info: file type   = Q4_K - Medium
  954. print_info: file size   = 1.87 GiB (5.01 BPW)
  955. time=2025-06-27T14:59:15.237-05:00 level=INFO source=server.go:625 msg="waiting for server to become available" status="llm server loading model"
  956. load: special tokens cache size = 256
  957. load: token to piece cache size = 0.7999 MB
  958. print_info: arch             = llama
  959. print_info: vocab_only       = 0
  960. print_info: n_ctx_train      = 131072
  961. print_info: n_embd           = 3072
  962. print_info: n_layer          = 28
  963. print_info: n_head           = 24
  964. print_info: n_head_kv        = 8
  965. print_info: n_rot            = 128
  966. print_info: n_swa            = 0
  967. print_info: n_swa_pattern    = 1
  968. print_info: n_embd_head_k    = 128
  969. print_info: n_embd_head_v    = 128
  970. print_info: n_gqa            = 3
  971. print_info: n_embd_k_gqa     = 1024
  972. print_info: n_embd_v_gqa     = 1024
  973. print_info: f_norm_eps       = 0.0e+00
  974. print_info: f_norm_rms_eps   = 1.0e-05
  975. print_info: f_clamp_kqv      = 0.0e+00
  976. print_info: f_max_alibi_bias = 0.0e+00
  977. print_info: f_logit_scale    = 0.0e+00
  978. print_info: f_attn_scale     = 0.0e+00
  979. print_info: n_ff             = 8192
  980. print_info: n_expert         = 0
  981. print_info: n_expert_used    = 0
  982. print_info: causal attn      = 1
  983. print_info: pooling type     = 0
  984. print_info: rope type        = 0
  985. print_info: rope scaling     = linear
  986. print_info: freq_base_train  = 500000.0
  987. print_info: freq_scale_train = 1
  988. print_info: n_ctx_orig_yarn  = 131072
  989. print_info: rope_finetuned   = unknown
  990. print_info: ssm_d_conv       = 0
  991. print_info: ssm_d_inner      = 0
  992. print_info: ssm_d_state      = 0
  993. print_info: ssm_dt_rank      = 0
  994. print_info: ssm_dt_b_c_rms   = 0
  995. print_info: model type       = 3B
  996. print_info: model params     = 3.21 B
  997. print_info: general.name     = Llama 3.2 3B Instruct
  998. print_info: vocab type       = BPE
  999. print_info: n_vocab          = 128256
  1000. print_info: n_merges         = 280147
  1001. print_info: BOS token        = 128000 '<|begin_of_text|>'
  1002. print_info: EOS token        = 128009 '<|eot_id|>'
  1003. print_info: EOT token        = 128009 '<|eot_id|>'
  1004. print_info: EOM token        = 128008 '<|eom_id|>'
  1005. print_info: LF token         = 198 'Ċ'
  1006. print_info: EOG token        = 128008 '<|eom_id|>'
  1007. print_info: EOG token        = 128009 '<|eot_id|>'
  1008. print_info: max token length = 256
  1009. load_tensors: loading model tensors, this can take a while... (mmap = false)
  1010. load_tensors:          CPU model buffer size =  1918.35 MiB
  1011. llama_context: constructing llama_context
  1012. llama_context: n_seq_max     = 1
  1013. llama_context: n_ctx         = 8192
  1014. llama_context: n_ctx_per_seq = 8192
  1015. llama_context: n_batch       = 512
  1016. llama_context: n_ubatch      = 512
  1017. llama_context: causal_attn   = 1
  1018. llama_context: flash_attn    = 0
  1019. llama_context: freq_base     = 500000.0
  1020. llama_context: freq_scale    = 1
  1021. llama_context: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
  1022. llama_context:        CPU  output buffer size =     0.50 MiB
  1023. llama_kv_cache_unified: kv_size = 8192, type_k = 'f16', type_v = 'f16', n_layer = 28, can_shift = 1, padding = 32
  1024. llama_kv_cache_unified:        CPU KV buffer size =   896.00 MiB
  1025. llama_kv_cache_unified: KV self size  =  896.00 MiB, K (f16):  448.00 MiB, V (f16):  448.00 MiB
  1026. llama_context:        CPU compute buffer size =   424.01 MiB
  1027. llama_context: graph nodes  = 958
  1028. llama_context: graph splits = 1
  1029. time=2025-06-27T14:59:21.273-05:00 level=INFO source=server.go:630 msg="llama runner started in 6.54 seconds"
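Note: the n_ctx_per_seq warning a few lines up means this runner was launched with an 8192-token window although the model was trained to 131072. A client can request a larger window per call through options.num_ctx in the Ollama REST API; a minimal sketch against the default local address (the model tag is an assumption, since the log names only the blob hash):

    package main

    import (
        "bytes"
        "fmt"
        "net/http"
    )

    func main() {
        // A larger num_ctx would reload the runner with a bigger --ctx-size;
        // on a 4 GiB card the larger KV cache also pushes more layers to CPU.
        body := []byte(`{
            "model": "llama3.2:3b",
            "messages": [{"role": "user", "content": "hello"}],
            "options": {"num_ctx": 16384}
        }`)
        resp, err := http.Post("http://127.0.0.1:11434/api/chat",
            "application/json", bytes.NewReader(body))
        if err != nil {
            fmt.Println(err)
            return
        }
        defer resp.Body.Close()
        fmt.Println(resp.Status)
    }
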
  1030. [GIN] 2025/06/27 - 14:59:39 | 200 |       1.691ms |       127.0.0.1 | HEAD     "/"
  1031. [GIN] 2025/06/27 - 14:59:39 | 200 |      1.0781ms |       127.0.0.1 | GET      "/api/ps"
1032. time=2025-06-27T15:00:05.182-05:00 level=ERROR source=server.go:800 msg="post predict" error="Post \"http://127.0.0.1:63871/completion\": context canceled"
  1033. [GIN] 2025/06/27 - 15:00:05 | 200 |          1m0s |       127.0.0.1 | POST     "/api/chat"
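Note: the ERROR above is Go context cancellation at work: the inbound /api/chat request was dropped after exactly 1m0s (which reads like a client-side timeout), its request context was canceled, and the cancellation propagated into the server's in-flight POST to the runner's /completion endpoint. The mechanism in isolation, with the port taken from the log and the rest illustrative:

    package main

    import (
        "context"
        "fmt"
        "net/http"
        "time"
    )

    func main() {
        // A one-minute deadline, like the request in the log appears to have had.
        ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
        defer cancel()

        req, _ := http.NewRequestWithContext(ctx, http.MethodPost,
            "http://127.0.0.1:63871/completion", nil)
        resp, err := http.DefaultClient.Do(req)
        if err == nil {
            resp.Body.Close()
        }
        // If the call outlives the deadline, err wraps context.DeadlineExceeded;
        // on the serving side the aborted request surfaces as "context canceled".
        fmt.Println(err)
    }
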
  1034.  