USER

given the vim plugin delimited by triple backticks:

```vim
" vim: ts=4 sts=4 expandtab

" colors (adjust to your liking)
highlight default llama_hl_hint guifg=#ff772f ctermfg=202
highlight default llama_hl_info guifg=#77ff2f ctermfg=119

" general parameters:
"
"   endpoint:         llama.cpp server endpoint
"   api_key:          llama.cpp server api key (optional)
"   n_prefix:         number of lines before the cursor location to include in the local prefix
"   n_suffix:         number of lines after the cursor location to include in the local suffix
"   n_predict:        max number of tokens to predict
"   stop_strings:     return the result immediately as soon as any of these strings are encountered in the generated text
"   t_max_prompt_ms:  max allotted time for the prompt processing (TODO: not yet supported)
"   t_max_predict_ms: max allotted time for the prediction
"   show_info:        show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline)
"   auto_fim:         trigger FIM completion automatically on cursor movement
"   max_line_suffix:  do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor
"   max_cache_keys:   max number of cached completions to keep in result_cache
"
" ring buffer of chunks, accumulated with time upon:
"
"   - completion request
"   - yank
"   - entering a buffer
"   - leaving a buffer
"   - writing a file
"
" parameters for the ring-buffer with extra context:
"
"   ring_n_chunks:   max number of chunks to pass as extra context to the server (0 to disable)
"   ring_chunk_size: max size of the chunks (in number of lines)
"                    note: adjust these numbers so that you don't overrun your context
"                    at ring_n_chunks = 64 and ring_chunk_size = 64 you need ~32k context
"   ring_scope:      the range around the cursor position (in number of lines) for gathering chunks after FIM
"   ring_update_ms:  how often to process queued chunks in normal mode
"
" keymaps parameters:
"
"   keymap_trigger:     keymap to trigger the completion, default: <C-F>
"   keymap_accept_full: keymap to accept full suggestion, default: <Tab>
"   keymap_accept_line: keymap to accept line suggestion, default: <S-Tab>
"   keymap_accept_word: keymap to accept word suggestion, default: <C-B>
"
let s:default_config = {
    \ 'endpoint':           'http://127.0.0.1:8012/infill',
    \ 'api_key':            '',
    \ 'n_prefix':           256,
    \ 'n_suffix':           64,
    \ 'n_predict':          128,
    \ 'stop_strings':       [],
    \ 't_max_prompt_ms':    500,
    \ 't_max_predict_ms':   1000,
    \ 'show_info':          2,
    \ 'auto_fim':           v:true,
    \ 'max_line_suffix':    8,
    \ 'max_cache_keys':     250,
    \ 'ring_n_chunks':      16,
    \ 'ring_chunk_size':    64,
    \ 'ring_scope':         1024,
    \ 'ring_update_ms':     1000,
    \ 'keymap_trigger':     "<C-F>",
    \ 'keymap_accept_full': "<Tab>",
    \ 'keymap_accept_line': "<S-Tab>",
    \ 'keymap_accept_word': "<C-B>",
    \ }

let llama_config = get(g:, 'llama_config', s:default_config)
let g:llama_config = extendnew(s:default_config, llama_config, 'force')

let s:llama_enabled = v:true

" contains cached responses from the server
" used to avoid re-computing the same completions and to also create new completions with similar context
" ref: https://github.com/ggml-org/llama.vim/pull/18
let g:cache_data = {}

" TODO: Currently the cache uses a random eviction policy. A more clever policy could be implemented (e.g. LRU).
function! s:cache_insert(key, value)
    if len(keys(g:cache_data)) > (g:llama_config.max_cache_keys - 1)
        let l:keys = keys(g:cache_data)
        let l:hash = l:keys[rand() % len(l:keys)]
        call remove(g:cache_data, l:hash)
    endif
    let g:cache_data[a:key] = a:value
endfunction

" get the number of leading spaces of a string
function! s:get_indent(str)
    let l:count = 0
    for i in range(len(a:str))
        if a:str[i] == "\t"
            let l:count += &tabstop - 1
        else
            break
        endif
    endfor
    return l:count
endfunction

function! s:rand(i0, i1) abort
    return a:i0 + rand() % (a:i1 - a:i0 + 1)
endfunction

function! llama#disable()
    call llama#fim_hide()
    autocmd! llama
    exe "silent! iunmap " .. g:llama_config.keymap_trigger
endfunction

function! llama#toggle()
    if s:llama_enabled
        call llama#disable()
    else
        call llama#init()
    endif
    let s:llama_enabled = !s:llama_enabled
endfunction

function llama#setup_commands()
    command! LlamaEnable  call llama#init()
    command! LlamaDisable call llama#disable()
    command! LlamaToggle  call llama#toggle()
endfunction

function! llama#init()
    if !executable('curl')
        echohl WarningMsg
        echo 'llama.vim requires the "curl" command to be available'
        echohl None
        return
    endif

    call llama#setup_commands()

    let s:fim_data = {}

    let s:ring_chunks = [] " current set of chunks used as extra context
    let s:ring_queued = [] " chunks that are queued to be sent for processing
    let s:ring_n_evict = 0

    let s:hint_shown = v:false
    let s:pos_y_pick = -9999 " last y where we picked a chunk
    let s:indent_last = -1 " last indentation level that was accepted (TODO: this might be buggy)

    let s:timer_fim = -1

    let s:t_last_move = reltime() " last time the cursor moved

    let s:current_job = v:null

    let s:ghost_text_nvim = exists('*nvim_buf_get_mark')
    let s:ghost_text_vim = has('textprop')

    if s:ghost_text_vim
        if version < 901
            echom 'Warning: llama.vim requires version 901 or greater. Current version: ' . version
        endif

        let s:hlgroup_hint = 'llama_hl_hint'
        let s:hlgroup_info = 'llama_hl_info'

        if empty(prop_type_get(s:hlgroup_hint))
            call prop_type_add(s:hlgroup_hint, {'highlight': s:hlgroup_hint})
        endif
        if empty(prop_type_get(s:hlgroup_info))
            call prop_type_add(s:hlgroup_info, {'highlight': s:hlgroup_info})
        endif
    endif

    augroup llama
        autocmd!
        exe "autocmd InsertEnter * inoremap <expr> <silent> " .. g:llama_config.keymap_trigger .. " llama#fim_inline(v:false, v:false)"

        autocmd InsertLeavePre  * call llama#fim_hide()
        autocmd CursorMoved     * call s:on_move()
        autocmd CursorMovedI    * call s:on_move()
        autocmd CompleteChanged * call llama#fim_hide()
        autocmd CompleteDone    * call s:on_move()

        if g:llama_config.auto_fim
            autocmd CursorMovedI * call llama#fim(-1, -1, v:true, [], v:true)
        endif

        " gather chunks upon yanking
        autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif

        " gather chunks upon entering/leaving a buffer
        autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)})
        autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)

        " gather chunk upon saving the file
        autocmd BufWritePost * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)
    augroup END

    silent! call llama#fim_hide()

    " init background update of the ring buffer
    if g:llama_config.ring_n_chunks > 0
        call s:ring_update()
    endif
endfunction

" compute how similar two chunks of text are
" 0 - no similarity, 1 - high similarity
" TODO: figure out something better
function! s:chunk_sim(c0, c1)
    let l:lines0 = len(a:c0)
    let l:lines1 = len(a:c1)

    let l:common = 0

    for l:line0 in a:c0
        for l:line1 in a:c1
            if l:line0 == l:line1
                let l:common += 1
                break
            endif
        endfor
    endfor

    return 2.0 * l:common / (l:lines0 + l:lines1)
endfunction

" pick a random chunk of size g:llama_config.ring_chunk_size from the provided text and queue it for processing
"
"   no_mod   - do not pick chunks from buffers with pending changes
"   do_evict - evict chunks that are very similar to the new one
"
function! s:pick_chunk(text, no_mod, do_evict)
    " do not pick chunks from buffers with pending changes or buffers that are not files
    if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%')))
        return
    endif

    " if the extra context option is disabled - do nothing
    if g:llama_config.ring_n_chunks <= 0
        return
    endif

    " don't pick very small chunks
    if len(a:text) < 3
        return
    endif

    if len(a:text) + 1 < g:llama_config.ring_chunk_size
        let l:chunk = a:text
    else
        let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size/2]))
        let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size/2, len(a:text)])

        let l:chunk = a:text[l:l0:l:l1]
    endif

    let l:chunk_str = join(l:chunk, "\n") . "\n"

    " check if this chunk is already added
    let l:exist = v:false

    for i in range(len(s:ring_chunks))
        if s:ring_chunks[i].data == l:chunk
            let l:exist = v:true
            break
        endif
    endfor

    for i in range(len(s:ring_queued))
        if s:ring_queued[i].data == l:chunk
            let l:exist = v:true
            break
        endif
    endfor

    if l:exist
        return
    endif

    " evict queued chunks that are very similar to the new one
    for i in range(len(s:ring_queued) - 1, 0, -1)
        if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.9
            if a:do_evict
                call remove(s:ring_queued, i)
                let s:ring_n_evict += 1
            else
                return
            endif
        endif
    endfor

    " also from s:ring_chunks
    for i in range(len(s:ring_chunks) - 1, 0, -1)
        if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9
            if a:do_evict
                call remove(s:ring_chunks, i)
                let s:ring_n_evict += 1
            else
                return
            endif
        endif
    endfor

    " TODO: become parameter ?
    if len(s:ring_queued) == 16
        call remove(s:ring_queued, 0)
    endif

    call add(s:ring_queued, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')})

    "let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
endfunction

" picks a queued chunk, sends it for processing and adds it to s:ring_chunks
" called every g:llama_config.ring_update_ms
function! s:ring_update()
    call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()})

    " update only if in normal mode or if the cursor hasn't moved for a while
    if mode() !=# 'n' && reltimefloat(reltime(s:t_last_move)) < 3.0
        return
    endif

    if len(s:ring_queued) == 0
        return
    endif

    " move the first queued chunk to the ring buffer
    if len(s:ring_chunks) == g:llama_config.ring_n_chunks
        call remove(s:ring_chunks, 0)
    endif

    call add(s:ring_chunks, remove(s:ring_queued, 0))

    "let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)

    " send asynchronous job with the new extra context so that it is ready for the next FIM
    let l:extra_context = []
    for l:chunk in s:ring_chunks
        call add(l:extra_context, {
            \ 'text':     l:chunk.str,
            \ 'time':     l:chunk.time,
            \ 'filename': l:chunk.filename
            \ })
    endfor

    " no samplers needed here
    let l:request = json_encode({
        \ 'input_prefix':     "",
        \ 'input_suffix':     "",
        \ 'input_extra':      l:extra_context,
        \ 'prompt':           "",
        \ 'n_predict':        0,
        \ 'temperature':      0.0,
        \ 'stream':           v:false,
        \ 'samplers':         [],
        \ 'cache_prompt':     v:true,
        \ 't_max_prompt_ms':  1,
        \ 't_max_predict_ms': 1,
        \ 'response_fields':  [""]
        \ })

    let l:curl_command = [
        \ "curl",
        \ "--silent",
        \ "--no-buffer",
        \ "--request", "POST",
        \ "--url", g:llama_config.endpoint,
        \ "--header", "Content-Type: application/json",
        \ "--data", "@-",
        \ ]

    if exists("g:llama_config.api_key") && len(g:llama_config.api_key) > 0
        call extend(l:curl_command, ['--header', 'Authorization: Bearer ' .. g:llama_config.api_key])
    endif

    " no callbacks because we don't need to process the response
    if s:ghost_text_nvim
        let jobid = jobstart(l:curl_command, {})
        call chansend(jobid, l:request)
        call chanclose(jobid, 'stdin')
    elseif s:ghost_text_vim
        let jobid = job_start(l:curl_command, {})
        let channel = job_getchannel(jobid)
        call ch_sendraw(channel, l:request)
        call ch_close_in(channel)
    endif
endfunction

" get the local context at a specified position
" a:prev can optionally contain a previous completion for this position
" in such cases, create the local context as if the completion was already inserted
function! s:fim_ctx_local(pos_x, pos_y, prev)
    let l:max_y = line('$')

    if empty(a:prev)
        let l:line_cur = getline(a:pos_y)

        let l:line_cur_prefix = strpart(l:line_cur, 0, a:pos_x)
        let l:line_cur_suffix = strpart(l:line_cur, a:pos_x)

        let l:lines_prefix = getline(max([1, a:pos_y - g:llama_config.n_prefix]), a:pos_y - 1)
        let l:lines_suffix = getline(a:pos_y + 1, min([l:max_y, a:pos_y + g:llama_config.n_suffix]))

        " special handling of lines full of whitespaces - start from the beginning of the line
        if match(l:line_cur, '^\s*$') >= 0
            let l:indent = 0

            let l:line_cur_prefix = ""
            let l:line_cur_suffix = ""
        else
            " the indentation of the current line
            let l:indent = strlen(matchstr(l:line_cur, '^\s*'))
        endif
    else
        if len(a:prev) == 1
            let l:line_cur = getline(a:pos_y) . a:prev[0]
        else
            let l:line_cur = a:prev[-1]
        endif

        let l:line_cur_prefix = l:line_cur
        let l:line_cur_suffix = ""

        let l:lines_prefix = getline(max([1, a:pos_y - g:llama_config.n_prefix + len(a:prev) - 1]), a:pos_y - 1)
        if len(a:prev) > 1
            call add(l:lines_prefix, getline(a:pos_y) . a:prev[0])
            for l:line in a:prev[1:-2]
                call add(l:lines_prefix, l:line)
            endfor
        endif

        let l:lines_suffix = getline(a:pos_y + 1, min([l:max_y, a:pos_y + g:llama_config.n_suffix]))

        let l:indent = s:indent_last
    endif

    let l:prefix = ""
        \ . join(l:lines_prefix, "\n")
        \ . "\n"

    let l:middle = ""
        \ . l:line_cur_prefix

    let l:suffix = ""
        \ . l:line_cur_suffix
        \ . "\n"
        \ . join(l:lines_suffix, "\n")
        \ . "\n"

    let l:res = {}

    let l:res['prefix'] = l:prefix
    let l:res['middle'] = l:middle
    let l:res['suffix'] = l:suffix
    let l:res['indent'] = l:indent

    let l:res['line_cur'] = l:line_cur

    let l:res['line_cur_prefix'] = l:line_cur_prefix
    let l:res['line_cur_suffix'] = l:line_cur_suffix

    return l:res
endfunction

" necessary for 'inoremap <expr>'
function! llama#fim_inline(is_auto, use_cache) abort
    " we already have a suggestion displayed - hide it
    if s:hint_shown && !a:is_auto
        call llama#fim_hide()
        return ''
    endif

    call llama#fim(-1, -1, a:is_auto, [], a:use_cache)

    return ''
endfunction

" the main FIM call
" takes local context around the cursor and sends it together with the extra context to the server for completion
function! llama#fim(pos_x, pos_y, is_auto, prev, use_cache) abort
    let l:pos_x = a:pos_x
    let l:pos_y = a:pos_y

    if l:pos_x < 0
        let l:pos_x = col('.') - 1
    endif
    if l:pos_y < 0
        let l:pos_y = line('.')
    endif

    " avoid sending repeated requests too fast
    if s:current_job != v:null
        if s:timer_fim != -1
            call timer_stop(s:timer_fim)
            let s:timer_fim = -1
        endif

        let s:timer_fim = timer_start(100, {-> llama#fim(a:pos_x, a:pos_y, v:true, a:prev, a:use_cache)})
        return
    endif

    "if s:hint_shown && empty(a:prev)
    "    return
    "endif

    "let s:t_fim_start = reltime()

    let l:ctx_local = s:fim_ctx_local(l:pos_x, l:pos_y, a:prev)

    let l:prefix = l:ctx_local['prefix']
    let l:middle = l:ctx_local['middle']
    let l:suffix = l:ctx_local['suffix']
    let l:indent = l:ctx_local['indent']

    if a:is_auto && len(l:ctx_local['line_cur_suffix']) > g:llama_config.max_line_suffix
        return
    endif

    let l:t_max_predict_ms = g:llama_config.t_max_predict_ms
    if empty(a:prev)
        " the first request is quick - we will launch a speculative request after this one is displayed
        let l:t_max_predict_ms = 250
    endif

    " compute multiple hashes that can be used to generate a completion for which the
    " first few lines are missing. this happens when we have scrolled down a bit from where the original
    " generation was done
    "
    let l:hashes = []

    call add(l:hashes, sha256(l:prefix . l:middle . 'Î' . l:suffix))

    let l:prefix_trim = l:prefix
    for i in range(3)
        let l:prefix_trim = substitute(l:prefix_trim, '^[^\n]*\n', '', '')
        if empty(l:prefix_trim)
            break
        endif

        call add(l:hashes, sha256(l:prefix_trim . l:middle . 'Î' . l:suffix))
    endfor

    " if we already have a cached completion for one of the hashes, don't send a request
    if a:use_cache
        for l:hash in l:hashes
            if get(g:cache_data, l:hash, v:null) != v:null
                return
            endif
        endfor
    endif

    " TODO: this might be incorrect
    let s:indent_last = l:indent

    " TODO: refactor in a function
    let l:text = getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')]))

    let l:l0 = s:rand(0, max([0, len(l:text) - g:llama_config.ring_chunk_size/2]))
    let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size/2, len(l:text)])

    let l:chunk = l:text[l:l0:l:l1]

    " evict chunks that are very similar to the current context
    " this is needed because such chunks usually distort the completion to repeat what was already there
    for i in range(len(s:ring_chunks) - 1, 0, -1)
        if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.5
            call remove(s:ring_chunks, i)
            let s:ring_n_evict += 1
        endif
    endfor

    " prepare the extra context data
    let l:extra_ctx = []
    for l:chunk in s:ring_chunks
        call add(l:extra_ctx, {
            \ 'text':     l:chunk.str,
            \ 'time':     l:chunk.time,
            \ 'filename': l:chunk.filename
            \ })
    endfor

    let l:request = json_encode({
        \ 'input_prefix':     l:prefix,
        \ 'input_suffix':     l:suffix,
        \ 'input_extra':      l:extra_ctx,
        \ 'prompt':           l:middle,
        \ 'n_predict':        g:llama_config.n_predict,
        \ 'stop':             g:llama_config.stop_strings,
        \ 'n_indent':         l:indent,
        \ 'top_k':            40,
        \ 'top_p':            0.90,
        \ 'stream':           v:false,
        \ 'samplers':         ["top_k", "top_p", "infill"],
        \ 'cache_prompt':     v:true,
        \ 't_max_prompt_ms':  g:llama_config.t_max_prompt_ms,
        \ 't_max_predict_ms': l:t_max_predict_ms,
        \ 'response_fields':  [
        \     "content",
        \     "timings/prompt_n",
        \     "timings/prompt_ms",
        \     "timings/prompt_per_token_ms",
        \     "timings/prompt_per_second",
        \     "timings/predicted_n",
        \     "timings/predicted_ms",
        \     "timings/predicted_per_token_ms",
        \     "timings/predicted_per_second",
        \     "truncated",
        \     "tokens_cached",
        \ ],
        \ })

    let l:curl_command = [
        \ "curl",
        \ "--silent",
        \ "--no-buffer",
        \ "--request", "POST",
        \ "--url", g:llama_config.endpoint,
        \ "--header", "Content-Type: application/json",
        \ "--data", "@-",
        \ ]

    if exists("g:llama_config.api_key") && len(g:llama_config.api_key) > 0
        call extend(l:curl_command, ['--header', 'Authorization: Bearer ' .. g:llama_config.api_key])
    endif

    if s:current_job != v:null
        if s:ghost_text_nvim
            call jobstop(s:current_job)
        elseif s:ghost_text_vim
            call job_stop(s:current_job)
        endif
    endif

    " send the request asynchronously
    if s:ghost_text_nvim
        let s:current_job = jobstart(l:curl_command, {
            \ 'on_stdout': function('s:fim_on_response', [l:hashes]),
            \ 'on_exit':   function('s:fim_on_exit'),
            \ 'stdout_buffered': v:true
            \ })

        call chansend(s:current_job, l:request)
        call chanclose(s:current_job, 'stdin')
    elseif s:ghost_text_vim
        let s:current_job = job_start(l:curl_command, {
            \ 'out_cb':  function('s:fim_on_response', [l:hashes]),
            \ 'exit_cb': function('s:fim_on_exit')
            \ })

        let channel = job_getchannel(s:current_job)
        call ch_sendraw(channel, l:request)
        call ch_close_in(channel)
    endif

    " TODO: per-file location
    let l:delta_y = abs(l:pos_y - s:pos_y_pick)

    " gather some extra context nearby and process it in the background
    " only gather chunks if the cursor has moved a lot
    " TODO: something more clever? reranking?
    if a:is_auto && l:delta_y > 32
        let l:max_y = line('$')

        " expand the prefix even further
        call s:pick_chunk(getline(max([1, l:pos_y - g:llama_config.ring_scope]), max([1, l:pos_y - g:llama_config.n_prefix])), v:false, v:false)

        " pick a suffix chunk
        call s:pick_chunk(getline(min([l:max_y, l:pos_y + g:llama_config.n_suffix]), min([l:max_y, l:pos_y + g:llama_config.n_suffix + g:llama_config.ring_chunk_size])), v:false, v:false)

        let s:pos_y_pick = l:pos_y
    endif
endfunction

" callback that processes the FIM result from the server
function! s:fim_on_response(hashes, job_id, data, event = v:null)
    if s:ghost_text_nvim
        let l:raw = join(a:data, "\n")
    elseif s:ghost_text_vim
        let l:raw = a:data
    endif

    " ignore empty results
    if len(l:raw) == 0
        return
    endif

    " ensure the response is valid JSON, starting with a fast check before full decode
    if l:raw !~# '^\s*{' || l:raw !~# '\v"content"\s*:"'
        return
    endif

    try
        let l:response = json_decode(l:raw)
    catch
        return
    endtry

    " put the response in the cache
    for l:hash in a:hashes
        call s:cache_insert(l:hash, l:raw)
    endfor

    " if nothing is currently displayed - show the hint directly
    if !s:hint_shown || !s:fim_data['can_accept']
        let l:pos_x = col('.') - 1
        let l:pos_y = line('.')

        call s:fim_try_hint(l:pos_x, l:pos_y)
    endif
endfunction

function! s:fim_on_exit(job_id, exit_code, event = v:null)
    if a:exit_code != 0
        echom "Job failed with exit code: " . a:exit_code
    endif

    let s:current_job = v:null
endfunction

function! s:on_move()
    let s:t_last_move = reltime()

    call llama#fim_hide()

    let l:pos_x = col('.') - 1
    let l:pos_y = line('.')

    call s:fim_try_hint(l:pos_x, l:pos_y)
endfunction

" try to generate a suggestion using the data in the cache
function! s:fim_try_hint(pos_x, pos_y)
    " show the suggestion only in insert mode
    if mode() !~# '\v^(i|ic|ix)$'
        return
    endif

    let l:pos_x = a:pos_x
    let l:pos_y = a:pos_y

    let l:ctx_local = s:fim_ctx_local(l:pos_x, l:pos_y, [])

    let l:prefix = l:ctx_local['prefix']
    let l:middle = l:ctx_local['middle']
    let l:suffix = l:ctx_local['suffix']

    let l:hash = sha256(l:prefix . l:middle . 'Î' . l:suffix)

    " Check if the completion is cached
    let l:raw = get(g:cache_data, l:hash, v:null)

    " ... or if there is a cached completion nearby (10 characters behind)
    " Looks at the previous 10 characters to see if a completion is cached. If one is found at (x, y)
    " then it checks that the characters typed after (x, y) match up with the cached completion result.
    if l:raw == v:null
        let l:pm = l:prefix . l:middle

        let l:best = 0
        for i in range(128)
            let l:removed  = l:pm[-(1 + i):]
            let l:ctx_new  = l:pm[:-(2 + i)] . 'Î' . l:suffix
            let l:hash_new = sha256(l:ctx_new)

            if has_key(g:cache_data, l:hash_new)
                let l:response_cached = get(g:cache_data, l:hash_new)
                if l:response_cached == ""
                    continue
                endif

                let l:response = json_decode(l:response_cached)
                if l:response['content'][0:i] !=# l:removed
                    continue
                endif

                let l:response['content'] = l:response['content'][i + 1:]

                if len(l:response['content']) > 0
                    if l:raw == v:null
                        let l:raw = json_encode(l:response)
                    elseif len(l:response['content']) > l:best
                        let l:best = len(l:response['content'])
                        let l:raw  = json_encode(l:response)
                    endif
                endif
            endif
        endfor
    endif

    if l:raw != v:null
        call s:fim_render(l:pos_x, l:pos_y, l:raw)

        " run async speculative FIM in the background for this position
        if s:hint_shown
            call llama#fim(l:pos_x, l:pos_y, v:true, s:fim_data['content'], v:true)
        endif
    endif
endfunction

" render a suggestion at the current cursor location
function! s:fim_render(pos_x, pos_y, data)
    " do not show if there is a completion in progress
    if pumvisible()
        return
    endif

    let l:raw = a:data

    let l:can_accept = v:true
    let l:has_info   = v:false

    let l:n_prompt    = 0
    let l:t_prompt_ms = 1.0
    let l:s_prompt    = 0

    let l:n_predict    = 0
    let l:t_predict_ms = 1.0
    let l:s_predict    = 0

    let l:content = []

    " get the generated suggestion
    if l:can_accept
        let l:response = json_decode(l:raw)

        for l:part in split(get(l:response, 'content', ''), "\n", 1)
            call add(l:content, l:part)
        endfor

        " remove trailing new lines
        while len(l:content) > 0 && l:content[-1] == ""
            call remove(l:content, -1)
        endwhile

        let l:n_cached  = get(l:response, 'tokens_cached', 0)
        let l:truncated = get(l:response, 'timings/truncated', v:false)

        " if response.timings is available
        if has_key(l:response, 'timings/prompt_n') && has_key(l:response, 'timings/prompt_ms') && has_key(l:response, 'timings/prompt_per_second')
            \ && has_key(l:response, 'timings/predicted_n') && has_key(l:response, 'timings/predicted_ms') && has_key(l:response, 'timings/predicted_per_second')
            let l:n_prompt    = get(l:response, 'timings/prompt_n', 0)
            let l:t_prompt_ms = get(l:response, 'timings/prompt_ms', 1)
            let l:s_prompt    = get(l:response, 'timings/prompt_per_second', 0)

            let l:n_predict    = get(l:response, 'timings/predicted_n', 0)
            let l:t_predict_ms = get(l:response, 'timings/predicted_ms', 1)
            let l:s_predict    = get(l:response, 'timings/predicted_per_second', 0)
        endif

        let l:has_info = v:true
    endif

    if len(l:content) == 0
        call add(l:content, "")
        let l:can_accept = v:false
    endif

    let l:pos_x = a:pos_x
    let l:pos_y = a:pos_y

    let l:line_cur = getline(l:pos_y)

    " if the current line is full of whitespaces, trim as much whitespaces from the suggestion
    if match(l:line_cur, '^\s*$') >= 0
        let l:lead = min([strlen(matchstr(l:content[0], '^\s*')), strlen(l:line_cur)])

        let l:line_cur   = strpart(l:content[0], 0, l:lead)
        let l:content[0] = strpart(l:content[0], l:lead)
    endif

    let l:line_cur_prefix = strpart(l:line_cur, 0, l:pos_x)
    let l:line_cur_suffix = strpart(l:line_cur, l:pos_x)

    " NOTE: the following is logic for discarding predictions that repeat existing text
    "       the code is quite ugly and there is very likely a simpler and more canonical way to implement this
    "
    "       still, I wonder if there is some better way that avoids having to do these special hacks?
    "       on one hand, the LLM 'sees' the contents of the file before we start editing, so it is normal that it would
    "       start generating whatever we have given it via the extra context. but on the other hand, it's not very
    "       helpful to re-generate the same code that is already there

    " truncate the suggestion if the first line is empty
    if len(l:content) == 1 && l:content[0] == ""
        let l:content = [""]
    endif

    " ... and the next lines are repeated
    if len(l:content) > 1 && l:content[0] == "" && l:content[1:] == getline(l:pos_y + 1, l:pos_y + len(l:content) - 1)
        let l:content = [""]
    endif

    " truncate the suggestion if it repeats the suffix
    if len(l:content) == 1 && l:content[0] == l:line_cur_suffix
        let l:content = [""]
    endif

    " find the first non-empty line (strip whitespace)
    let l:cmp_y = l:pos_y + 1
    while l:cmp_y < line('$') && getline(l:cmp_y) =~? '^\s*$'
        let l:cmp_y += 1
    endwhile

    if (l:line_cur_prefix . l:content[0]) == getline(l:cmp_y)
        " truncate the suggestion if it repeats the next line
        if len(l:content) == 1
            let l:content = [""]
        endif

        " ... or if the second line of the suggestion is the prefix of line l:cmp_y + 1
        if len(l:content) == 2 && l:content[-1] == getline(l:cmp_y + 1)[:len(l:content[-1]) - 1]
            let l:content = [""]
        endif

        " ... or if the middle chunk of lines of the suggestion is the same as [l:cmp_y + 1, l:cmp_y + len(l:content) - 1)
        if len(l:content) > 2 && join(l:content[1:-1], "\n") == join(getline(l:cmp_y + 1, l:cmp_y + len(l:content) - 1), "\n")
            let l:content = [""]
        endif
    endif

    " keep only lines that have the same or larger whitespace prefix as l:line_cur_prefix
    "let l:indent = strlen(matchstr(l:line_cur_prefix, '^\s*'))
    "for i in range(1, len(l:content) - 1)
    "    if strlen(matchstr(l:content[i], '^\s*')) < l:indent
    "        let l:content = l:content[:i - 1]
    "        break
    "    endif
    "endfor

    let l:content[-1] .= l:line_cur_suffix

    " if only whitespaces - do not accept
    if join(l:content, "\n") =~? '^\s*$'
        let l:can_accept = v:false
    endif

    " display virtual text with the suggestion
    let l:bufnr = bufnr('%')

    if s:ghost_text_nvim
        let l:id_vt_fim = nvim_create_namespace('vt_fim')
    endif

    let l:info = ''

    " construct the info message
    if g:llama_config.show_info > 0 && l:has_info
        let l:prefix = ' '

        if l:truncated
            let l:info = printf("%s | WARNING: the context is full: %d, increase the server context size or reduce g:llama_config.ring_n_chunks",
                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
                \ l:n_cached
                \ )
        else
            let l:info = printf("%s | c: %d, r: %d/%d, e: %d, q: %d/16, C: %d/%d | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s)",
                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
                \ l:n_cached, len(s:ring_chunks), g:llama_config.ring_n_chunks, s:ring_n_evict, len(s:ring_queued),
                \ len(keys(g:cache_data)), g:llama_config.max_cache_keys,
                \ l:n_prompt, l:t_prompt_ms, l:s_prompt,
                \ l:n_predict, l:t_predict_ms, l:s_predict
                \ )
        endif

        if g:llama_config.show_info == 1
            " display the info in the statusline
            let &statusline = l:info
            let l:info = ''
        endif
    endif

    " display the suggestion and append the info to the end of the first line
    if s:ghost_text_nvim
        call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, l:pos_y - 1, l:pos_x, {
            \ 'virt_text': [[l:content[0], 'llama_hl_hint'], [l:info, 'llama_hl_info']],
            \ 'virt_text_pos': l:content == [""] ? 'eol' : 'overlay'
            \ })

        call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, l:pos_y - 1, 0, {
            \ 'virt_lines': map(l:content[1:], {idx, val -> [[val, 'llama_hl_hint']]})
            \ })
    elseif s:ghost_text_vim
        let l:full_suffix = l:content[0]
        if !empty(l:full_suffix)
            let l:new_suffix = l:full_suffix[0:-len(l:line_cur[l:pos_x:])-1]
            call prop_add(l:pos_y, l:pos_x + 1, {
                \ 'type': s:hlgroup_hint,
                \ 'text': l:new_suffix
                \ })
        endif

        for line in l:content[1:]
            call prop_add(l:pos_y, 0, {
                \ 'type': s:hlgroup_hint,
                \ 'text': line,
                \ 'text_padding_left': s:get_indent(line),
                \ 'text_align': 'below'
                \ })
        endfor

        if !empty(l:info)
            call prop_add(l:pos_y, 0, {
                \ 'type': s:hlgroup_info,
                \ 'text': l:info,
                \ 'text_wrap': 'truncate'
                \ })
        endif
    endif

    " setup accept shortcuts
    exe 'inoremap <buffer> ' . g:llama_config.keymap_accept_full . ' <C-O>:call llama#fim_accept(''full'')<CR>'
    exe 'inoremap <buffer> ' . g:llama_config.keymap_accept_line . ' <C-O>:call llama#fim_accept(''line'')<CR>'
    exe 'inoremap <buffer> ' . g:llama_config.keymap_accept_word . ' <C-O>:call llama#fim_accept(''word'')<CR>'

    let s:hint_shown = v:true

    let s:fim_data['pos_x'] = l:pos_x
    let s:fim_data['pos_y'] = l:pos_y

    let s:fim_data['line_cur'] = l:line_cur

    let s:fim_data['can_accept'] = l:can_accept
    let s:fim_data['content']    = l:content
endfunction

" if accept_type == 'full', accept entire response
" if accept_type == 'line', accept only the first line of the response
" if accept_type == 'word', accept only the first word of the response
function! llama#fim_accept(accept_type)
    let l:pos_x      = s:fim_data['pos_x']
    let l:pos_y      = s:fim_data['pos_y']
    let l:line_cur   = s:fim_data['line_cur']
    let l:can_accept = s:fim_data['can_accept']
    let l:content    = s:fim_data['content']

    if l:can_accept && len(l:content) > 0
        " insert suggestion on current line
        if a:accept_type != 'word'
            " insert first line of suggestion
            call setline(l:pos_y, l:line_cur[:(l:pos_x - 1)] . l:content[0])
        else
            " insert first word of suggestion
            let l:suffix = l:line_cur[(l:pos_x):]
            let l:word = matchstr(l:content[0][:-(len(l:suffix) + 1)], '^\s*\S\+')
            call setline(l:pos_y, l:line_cur[:(l:pos_x - 1)] . l:word . l:suffix)
        endif

        " insert rest of suggestion
        if len(l:content) > 1 && a:accept_type == 'full'
            call append(l:pos_y, l:content[1:-1])
        endif

        " move cursor
        if a:accept_type == 'word'
            " move cursor to end of word
            call cursor(l:pos_y, l:pos_x + len(l:word) + 1)
        elseif a:accept_type == 'line' || len(l:content) == 1
            " move cursor for 1-line suggestion
            call cursor(l:pos_y, l:pos_x + len(l:content[0]) + 1)
            if len(l:content) > 2
                " simulate pressing Enter to move to next line
                call feedkeys("\<CR>")
            endif
        else
            " move cursor for multi-line suggestion
            call cursor(l:pos_y + len(l:content) - 1, len(l:content[-1]) + 1)
        endif
    endif

    call llama#fim_hide()
endfunction

function! llama#fim_hide()
    let s:hint_shown = v:false

    " clear the virtual text
    let l:bufnr = bufnr('%')

    if s:ghost_text_nvim
        let l:id_vt_fim = nvim_create_namespace('vt_fim')

        call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1)
    elseif s:ghost_text_vim
        call prop_remove({'type': s:hlgroup_hint, 'all': v:true})
        call prop_remove({'type': s:hlgroup_info, 'all': v:true})
    endif

    " remove the mappings
    exe 'silent! iunmap <buffer> ' . g:llama_config.keymap_accept_full
    exe 'silent! iunmap <buffer> ' . g:llama_config.keymap_accept_line
    exe 'silent! iunmap <buffer> ' . g:llama_config.keymap_accept_word
endfunction
```

can you create a similar Qt Creator plugin?

Qwen3 Coder 30b

Qt Creator Plugin for LLaMA Infill Completion

I’ll create a Qt Creator plugin that provides functionality similar to the vim plugin you’ve shared, including auto-completion, context gathering, and integration with the llama.cpp server.

Plugin Structure

The plugin will consist of:

  1. Core plugin class with initialization
  2. Auto-completion system with FIM (Fill-In-the-Middle)
  3. Context gathering and caching mechanisms
  4. Integration with Qt Creator’s editor framework
  5. Configuration UI

Let me implement this as a complete plugin:

// llama_plugin.h
#ifndef LLAMA_PLUGIN_H
#define LLAMA_PLUGIN_H

#include <extensionsystem/iplugin.h>

#include <QTimer>
#include <QJsonObject>
#include <QJsonArray>
#include <QNetworkAccessManager>
#include <QNetworkRequest>
#include <QNetworkReply>
#include <QSettings>
#include <QHash>
#include <QDateTime>
#include <QPoint>

#include "llama_settings.h" // shared Config struct, used by both the plugin and the settings dialog

namespace Core { class IEditor; }

QT_BEGIN_NAMESPACE
class QTextDocument;
QT_END_NAMESPACE

namespace Llama {
namespace Internal {

class LlamaPlugin : public ExtensionSystem::IPlugin
{
    Q_OBJECT
    Q_PLUGIN_METADATA(IID "org.qt-project.Qt.QtCreatorPlugin" FILE "LlamaPlugin.json")

public:
    LlamaPlugin();
    ~LlamaPlugin();

    bool initialize(const QStringList &arguments, QString *errorString) override;
    void extensionsInitialized() override;
    ShutdownFlag aboutToShutdown() override;

private slots:
    void handleEditorOpened(Core::IEditor *editor);
    void handleEditorClosed(Core::IEditor *editor);
    void handleCursorPositionChanged();
    void handleTextChanged();
    void checkForCompletion();

private:
    void setupConfiguration();
    void loadConfiguration();
    void saveConfiguration();
    
    // Completion handling
    void requestCompletion(int pos_x, int pos_y, bool isAuto = false);
    void processCompletionResponse(const QString &response);
    void showCompletionHint(const QString &content, int pos_x, int pos_y);
    void hideCompletionHint();
    
    // Context management
    QString getLocalContext(int pos_x, int pos_y, const QString &prev = QString());
    void gatherExtraContext();
    void processExtraContext();
    
    // Configuration (the shared Config struct from llama_settings.h, so it can
    // be handed directly to the LlamaSettings dialog)
    Config m_config;
    
    // Caching
    QHash<QString, QString> m_cacheData;
    
    // Context chunks
    struct Chunk {
        QString data;
        QDateTime time;
        QString filename;
    };
    QList<Chunk> m_ringChunks;
    QList<Chunk> m_ringQueued;
    int m_ringNEvict;
    
    // State tracking
    bool m_hintShown;
    QPoint m_lastPos;
    int m_indentLast;
    QTimer *m_completionTimer;
    QNetworkAccessManager *m_networkManager;
    QString m_currentRequestId;
    
    // Editor tracking
    Core::IEditor *m_currentEditor;
    QTextDocument *m_currentDocument;
};

} // namespace Internal
} // namespace Llama

#endif // LLAMA_PLUGIN_H
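
The Q_PLUGIN_METADATA macro above references a LlamaPlugin.json metadata file that every Qt Creator plugin must ship. A minimal sketch (the name, version, vendor, and dependency versions are placeholders to match your Qt Creator build):

// LlamaPlugin.json
{
    "Name" : "LlamaPlugin",
    "Version" : "0.1.0",
    "CompatVersion" : "0.1.0",
    "Vendor" : "Example",
    "Description" : "llama.cpp FIM completion for Qt Creator",
    "Dependencies" : [
        { "Name" : "Core", "Version" : "12.0.0" },
        { "Name" : "TextEditor", "Version" : "12.0.0" }
    ]
}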
// llama_plugin.cpp
#include "llama_plugin.h"
#include "llama_settings.h"

#include <coreplugin/editormanager/editormanager.h>
#include <coreplugin/editormanager/ieditor.h>
#include <coreplugin/documentmanager.h>
#include <texteditor/texteditor.h>
#include <texteditor/textdocument.h>
#include <utils/qtcassert.h>
#include <utils/fileutils.h>
#include <QSettings>
#include <QDir>
#include <QCryptographicHash>
#include <QRandomGenerator>
#include <QToolTip>
#include <QApplication>
#include <QJsonDocument>
#include <QJsonObject>
#include <QJsonArray>
#include <QNetworkRequest>
#include <QNetworkReply>
#include <QTimer>
#include <QMessageBox>
#include <QDebug>

using namespace Core;
using namespace TextEditor;

namespace Llama {
namespace Internal {

LlamaPlugin::LlamaPlugin()
    : m_hintShown(false)
    , m_ringNEvict(0)
    , m_currentEditor(nullptr)
    , m_currentDocument(nullptr)
    , m_completionTimer(new QTimer(this))
    , m_networkManager(new QNetworkAccessManager(this))
{
    // Set up default configuration
    m_config.endpoint = "http://127.0.0.1:8012/infill";
    m_config.apiKey = "";
    m_config.nPrefix = 256;
    m_config.nSuffix = 64;
    m_config.nPredict = 128;
    m_config.stopStrings = QStringList();
    m_config.tMaxPromptMs = 500;
    m_config.tMaxPredictMs = 1000;
    m_config.showInfo = 2;
    m_config.autoFim = true;
    m_config.maxLineSuffix = 8;
    m_config.maxCacheKeys = 250;
    m_config.ringNChunks = 16;
    m_config.ringChunkSize = 64;
    m_config.ringScope = 1024;
    m_config.ringUpdateMs = 1000;
    m_config.keymapTrigger = "<C-F>";
    m_config.keymapAcceptFull = "<Tab>";
    m_config.keymapAcceptLine = "<S-Tab>";
    m_config.keymapAcceptWord = "<C-B>";
    
    // the timer periodically flushes queued context chunks to the server,
    // mirroring the vim plugin's s:ring_update()
    connect(m_completionTimer, &QTimer::timeout, this, &LlamaPlugin::processExtraContext);
}

LlamaPlugin::~LlamaPlugin()
{
    // m_currentDocument is the QTextDocument the connection was made on;
    // Core::IEditor::document() returns a Core::IDocument, which is a different type
    if (m_currentDocument)
        disconnect(m_currentDocument, &QTextDocument::contentsChanged,
                   this, &LlamaPlugin::handleTextChanged);
}

bool LlamaPlugin::initialize(const QStringList &arguments, QString *errorString)
{
    Q_UNUSED(arguments)
    Q_UNUSED(errorString)
    
    // Connect to editor manager signals
    connect(EditorManager::instance(), &EditorManager::editorOpened,
            this, &LlamaPlugin::handleEditorOpened);
    connect(EditorManager::instance(), &EditorManager::editorAboutToClose,
            this, &LlamaPlugin::handleEditorClosed);
            
    // Load configuration
    loadConfiguration();
    
    // Set up timer for context gathering
    if (m_config.ringNChunks > 0) {
        m_completionTimer->start(m_config.ringUpdateMs);
    }
    
    return true;
}

void LlamaPlugin::extensionsInitialized()
{
    // Initialize any extensions here
}
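
// Note: Qt Creator registers shortcuts through the ActionManager rather than
// vim-style key maps, so the keymap strings in the config would normally be
// turned into actions during initialize(). A sketch of the trigger binding;
// "Llama.Trigger" is a made-up Utils::Id, and the required includes
// (<coreplugin/actionmanager/actionmanager.h>, <coreplugin/actionmanager/command.h>,
// <coreplugin/coreconstants.h>) are not part of the listing above:
//
//     auto *triggerAction = new QAction(tr("Trigger LLaMA Completion"), this);
//     Core::Command *cmd = Core::ActionManager::registerAction(
//         triggerAction, "Llama.Trigger", Core::Context(Core::Constants::C_GLOBAL));
//     cmd->setDefaultKeySequence(QKeySequence(tr("Ctrl+F")));
//     connect(triggerAction, &QAction::triggered,
//             this, &LlamaPlugin::checkForCompletion);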

ExtensionSystem::IPlugin::ShutdownFlag LlamaPlugin::aboutToShutdown()
{
    saveConfiguration();
    return SynchronousShutdown;
}

void LlamaPlugin::handleEditorOpened(Core::IEditor *editor)
{
    if (!editor) return;
    
    TextEditorWidget *widget = qobject_cast<TextEditorWidget*>(editor->widget());
    if (!widget) return; // not a text editor
    
    m_currentEditor = editor;
    m_currentDocument = widget->document(); // the QTextDocument of the underlying QPlainTextEdit
    
    // Connect to text changes
    connect(m_currentDocument, &QTextDocument::contentsChanged,
            this, &LlamaPlugin::handleTextChanged);
            
    // Connect to cursor position changes (signal inherited from QPlainTextEdit)
    connect(widget, &TextEditorWidget::cursorPositionChanged,
            this, &LlamaPlugin::handleCursorPositionChanged);
}

void LlamaPlugin::handleEditorClosed(Core::IEditor *editor)
{
    if (m_currentEditor == editor) {
        m_currentEditor = nullptr;
        m_currentDocument = nullptr;
        hideCompletionHint();
    }
}

void LlamaPlugin::handleCursorPositionChanged()
{
    if (!m_currentEditor || !m_config.autoFim) return;
    
    // Trigger completion check
    checkForCompletion();
}

void LlamaPlugin::handleTextChanged()
{
    if (!m_currentEditor) return;
    
    // Gather context when text changes
    gatherExtraContext();
}

void LlamaPlugin::checkForCompletion()
{
    if (!m_currentEditor || !m_config.autoFim) return;
    
    TextEditorWidget *editor = qobject_cast<TextEditorWidget*>(m_currentEditor->widget());
    if (!editor) return;
    
    QTextCursor cursor = editor->textCursor();
    int pos_x = cursor.positionInBlock();
    int pos_y = cursor.blockNumber() + 1;
    
    // Only auto-trigger at the end of a line; this stands in for the vim plugin's
    // max_line_suffix check (block.length() includes the trailing block separator)
    QTextBlock block = cursor.block();
    if (pos_x >= block.length() - 1) {
        requestCompletion(pos_x, pos_y, true);
    }
}
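
// The vim plugin debounces rapid auto-triggers with a 100 ms timer
// (s:timer_fim) before re-issuing llama#fim(). A single-shot timer would be
// the Qt equivalent, e.g. instead of calling requestCompletion() directly:
//
//     QTimer::singleShot(100, this, [this, pos_x, pos_y] {
//         requestCompletion(pos_x, pos_y, true);
//     });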

void LlamaPlugin::setupConfiguration()
{
    // Create configuration dialog
    LlamaSettings *settingsDialog = new LlamaSettings(m_config, nullptr);
    
    connect(settingsDialog, &LlamaSettings::configChanged, 
            [this](const Config &newConfig) {
        m_config = newConfig;
        saveConfiguration();
        
        if (m_config.ringNChunks > 0 && !m_completionTimer->isActive()) {
            m_completionTimer->start(m_config.ringUpdateMs);
        } else if (m_config.ringNChunks == 0 && m_completionTimer->isActive()) {
            m_completionTimer->stop();
        }
    });
    
    settingsDialog->show();
}

void LlamaPlugin::loadConfiguration()
{
    QSettings settings(QSettings::UserScope, "QtProject", "LlamaPlugin");
    
    m_config.endpoint = settings.value("endpoint", m_config.endpoint).toString();
    m_config.apiKey = settings.value("apiKey", m_config.apiKey).toString();
    m_config.nPrefix = settings.value("nPrefix", m_config.nPrefix).toInt();
    m_config.nSuffix = settings.value("nSuffix", m_config.nSuffix).toInt();
    m_config.nPredict = settings.value("nPredict", m_config.nPredict).toInt();
    m_config.tMaxPromptMs = settings.value("tMaxPromptMs", m_config.tMaxPromptMs).toInt();
    m_config.tMaxPredictMs = settings.value("tMaxPredictMs", m_config.tMaxPredictMs).toInt();
    m_config.showInfo = settings.value("showInfo", m_config.showInfo).toInt();
    m_config.autoFim = settings.value("autoFim", m_config.autoFim).toBool();
    m_config.maxLineSuffix = settings.value("maxLineSuffix", m_config.maxLineSuffix).toInt();
    m_config.maxCacheKeys = settings.value("maxCacheKeys", m_config.maxCacheKeys).toInt();
    m_config.ringNChunks = settings.value("ringNChunks", m_config.ringNChunks).toInt();
    m_config.ringChunkSize = settings.value("ringChunkSize", m_config.ringChunkSize).toInt();
    m_config.ringScope = settings.value("ringScope", m_config.ringScope).toInt();
    m_config.ringUpdateMs = settings.value("ringUpdateMs", m_config.ringUpdateMs).toInt();
    m_config.keymapTrigger = settings.value("keymapTrigger", m_config.keymapTrigger).toString();
    m_config.keymapAcceptFull = settings.value("keymapAcceptFull", m_config.keymapAcceptFull).toString();
    m_config.keymapAcceptLine = settings.value("keymapAcceptLine", m_config.keymapAcceptLine).toString();
    m_config.keymapAcceptWord = settings.value("keymapAcceptWord", m_config.keymapAcceptWord).toString();
    
    // Load stop strings
    QStringList stopStrings;
    int count = settings.beginReadArray("stopStrings");
    for (int i = 0; i < count; ++i) {
        settings.setArrayIndex(i);
        stopStrings << settings.value("string").toString();
    }
    settings.endArray();
    m_config.stopStrings = stopStrings;
}

void LlamaPlugin::saveConfiguration()
{
    QSettings settings(QSettings::UserScope, "QtProject", "LlamaPlugin");
    
    settings.setValue("endpoint", m_config.endpoint);
    settings.setValue("apiKey", m_config.apiKey);
    settings.setValue("nPrefix", m_config.nPrefix);
    settings.setValue("nSuffix", m_config.nSuffix);
    settings.setValue("nPredict", m_config.nPredict);
    settings.setValue("tMaxPromptMs", m_config.tMaxPromptMs);
    settings.setValue("tMaxPredictMs", m_config.tMaxPredictMs);
    settings.setValue("showInfo", m_config.showInfo);
    settings.setValue("autoFim", m_config.autoFim);
    settings.setValue("maxLineSuffix", m_config.maxLineSuffix);
    settings.setValue("maxCacheKeys", m_config.maxCacheKeys);
    settings.setValue("ringNChunks", m_config.ringNChunks);
    settings.setValue("ringChunkSize", m_config.ringChunkSize);
    settings.setValue("ringScope", m_config.ringScope);
    settings.setValue("ringUpdateMs", m_config.ringUpdateMs);
    settings.setValue("keymapTrigger", m_config.keymapTrigger);
    settings.setValue("keymapAcceptFull", m_config.keymapAcceptFull);
    settings.setValue("keymapAcceptLine", m_config.keymapAcceptLine);
    settings.setValue("keymapAcceptWord", m_config.keymapAcceptWord);
    
    // Save stop strings
    settings.beginWriteArray("stopStrings");
    for (int i = 0; i < m_config.stopStrings.size(); ++i) {
        settings.setArrayIndex(i);
        settings.setValue("string", m_config.stopStrings[i]);
    }
    settings.endArray();
}

void LlamaPlugin::requestCompletion(int pos_x, int pos_y, bool isAuto)
{
    if (!m_currentEditor || !m_currentDocument) return;
    
    // Get local context
    QString context = getLocalContext(pos_x, pos_y);
    if (context.isEmpty()) return;
    
    // Check cache first
    QString hash = QString::fromLatin1(
        QCryptographicHash::hash(context.toUtf8(), QCryptographicHash::Sha256).toHex());
    
    if (m_cacheData.contains(hash)) {
        processCompletionResponse(m_cacheData[hash]);
        return;
    }
    
    // Build the JSON request. This simplified version sends only the current
    // line: the text left of the cursor as the prompt (middle part) and the
    // rest of the line as the suffix.
    QTextBlock block = m_currentDocument->findBlockByNumber(pos_y - 1);
    if (!block.isValid()) return;
    
    QString line = block.text();
    QString middle = line.left(pos_x);
    QString suffix = line.mid(pos_x);
    
    QJsonObject request;
    request["input_prefix"] = "";
    request["input_suffix"] = suffix;
    request["prompt"] = middle;
    request["n_predict"] = m_config.nPredict;
    request["stop"] = QJsonArray::fromStringList(m_config.stopStrings);
    request["top_k"] = 40;
    request["top_p"] = 0.9;
    request["stream"] = false;
    request["samplers"] = QStringList{"top_k", "top_p", "infill"};
    request["cache_prompt"] = true;
    request["t_max_prompt_ms"] = m_config.tMaxPromptMs;
    request["t_max_predict_ms"] = isAuto ? 250 : m_config.tMaxPredictMs; // Faster for auto completion
    
    // Add extra context
    QJsonArray extraContext;
    for (const Chunk &chunk : m_ringChunks) {
        QJsonObject chunkObj;
        chunkObj["text"] = chunk.data;
        chunkObj["time"] = chunk.time.toString(Qt::ISODate);
        chunkObj["filename"] = chunk.filename;
        extraContext.append(chunkObj);
    }
    request["input_extra"] = extraContext;
    
    // Create JSON document
    QJsonDocument doc(request);
    QByteArray jsonData = doc.toJson(QJsonDocument::Compact);
    
    // Send request
    QNetworkRequest req(QUrl(m_config.endpoint));
    req.setHeader(QNetworkRequest::ContentTypeHeader, "application/json");
    if (!m_config.apiKey.isEmpty()) {
        req.setRawHeader("Authorization", "Bearer " + m_config.apiKey.toUtf8());
    }
    
    QNetworkReply *reply = m_networkManager->post(req, jsonData);
    
    // Connect to response
    connect(reply, &QNetworkReply::finished, 
            [this, reply, hash]() {
        if (reply->error() == QNetworkReply::NoError) {
            QByteArray data = reply->readAll();
            QString response = QString::fromUtf8(data);
            
            // Cache the result
            m_cacheData[hash] = response;
            
            // Limit cache size by evicting an arbitrary entry (the vim plugin
            // uses random eviction and notes that LRU would be smarter)
            if (m_cacheData.size() > m_config.maxCacheKeys) {
                m_cacheData.erase(m_cacheData.begin());
            }
            
            processCompletionResponse(response);
        } else {
            qDebug() << "Error fetching completion:" << reply->errorString();
        }
        reply->deleteLater();
    });
}
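
// Unlike the vim plugin, which stops the in-flight curl job (jobstop/job_stop)
// before starting a new request, the code above lets concurrent replies race.
// A minimal fix, assuming a hypothetical QPointer<QNetworkReply> m_currentReply
// member is added to the class; QNetworkReply::abort() is stock Qt:
//
//     if (m_currentReply)
//         m_currentReply->abort(); // finished() fires with OperationCanceledError
//     m_currentReply = m_networkManager->post(req, jsonData);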

void LlamaPlugin::processCompletionResponse(const QString &response)
{
    // Parse JSON response
    QJsonParseError error;
    QJsonDocument doc = QJsonDocument::fromJson(response.toUtf8(), &error);
    
    if (error.error != QJsonParseError::NoError) {
        qDebug() << "JSON parse error:" << error.errorString();
        return;
    }
    
    QJsonObject obj = doc.object();
    QString content = obj.value("content").toString();
    
    if (!content.isEmpty()) {
        // Get cursor position for hint
        TextEditorWidget *editor = qobject_cast<TextEditorWidget*>(m_currentEditor->widget());
        if (editor) {
            QTextCursor cursor = editor->textCursor();
            int pos_x = cursor.positionInBlock();
            int pos_y = cursor.blockNumber() + 1;
            
            showCompletionHint(content, pos_x, pos_y);
        }
    }
}
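
// The vim plugin additionally hashes several prefix-trimmed variants of the
// context, so a cached completion still matches after scrolling a few lines.
// A direct transcription, offered as a sketch (computeContextHashes is
// hypothetical and not called from the code above; QCryptographicHash is
// included at the top of this file):
static QStringList computeContextHashes(QString prefix, const QString &middle,
                                        const QString &suffix)
{
    QStringList hashes;
    const auto push = [&hashes](const QString &ctx) {
        hashes << QString::fromLatin1(
            QCryptographicHash::hash(ctx.toUtf8(), QCryptographicHash::Sha256).toHex());
    };
    
    // same 'Î' separator between the middle and suffix parts as the vim plugin
    push(prefix + middle + QStringLiteral("Î") + suffix);
    
    // drop up to three leading lines from the prefix, hashing each variant
    for (int i = 0; i < 3; ++i) {
        const int nl = prefix.indexOf(QLatin1Char('\n'));
        if (nl < 0)
            break;
        prefix.remove(0, nl + 1);
        if (prefix.isEmpty())
            break;
        push(prefix + middle + QStringLiteral("Î") + suffix);
    }
    return hashes;
}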

void LlamaPlugin::showCompletionHint(const QString &content, int pos_x, int pos_y)
{
    Q_UNUSED(content)
    Q_UNUSED(pos_x)
    Q_UNUSED(pos_y)
    
    // In Qt Creator we can't directly add virtual text as in Vim. A real
    // implementation would paint the suggestion in an overlay or a custom
    // widget; a tooltip-based sketch follows this function.
    
    if (m_config.showInfo > 0) {
        // A status-bar message is the closest stand-in for the vim plugin's
        // statusline info (showInfo == 2 would append detailed timing data)
    }
    
    m_hintShown = true;
}
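
// A minimal sketch of one way to actually display the hint: a QToolTip anchored
// at the text cursor as a stand-in for Vim's ghost text. This helper is
// illustrative and not called from the plugin above; it relies only on stock
// APIs (TextEditorWidget inherits QPlainTextEdit, so cursorRect() and
// viewport() are available; <QToolTip> is included at the top of this file).
static void showHintTooltip(TextEditorWidget *editor, const QString &content)
{
    if (!editor || content.isEmpty())
        return;
    
    // map the cursor rectangle to global coordinates and pop the suggestion there
    const QRect rect = editor->cursorRect();
    const QPoint globalPos = editor->viewport()->mapToGlobal(rect.bottomRight());
    QToolTip::showText(globalPos, content, editor);
}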

void LlamaPlugin::hideCompletionHint()
{
    m_hintShown = false;
}
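
// The vim plugin also lets the user accept a suggestion word-, line- or
// full-wise (llama#fim_accept). A minimal sketch of the full-accept case,
// assuming the pending suggestion text is kept around (both this helper and
// its pendingContent argument are hypothetical); QTextCursor is stock Qt:
static void acceptSuggestion(TextEditorWidget *editor, const QString &pendingContent)
{
    if (!editor || pendingContent.isEmpty())
        return;
    
    // insert the suggestion at the cursor as a single undoable edit
    QTextCursor cursor = editor->textCursor();
    cursor.beginEditBlock();
    cursor.insertText(pendingContent);
    cursor.endEditBlock();
    editor->setTextCursor(cursor);
}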

QString LlamaPlugin::getLocalContext(int pos_x, int pos_y, const QString &prev)
{
    Q_UNUSED(prev) // reserved for speculative completions, like a:prev in the vim plugin
    
    if (!m_currentDocument) return QString();
    
    QTextBlock block = m_currentDocument->findBlockByNumber(pos_y - 1);
    if (!block.isValid()) return QString();
    
    QString lineCur = block.text();
    QString lineCurPrefix = lineCur.left(pos_x);
    QString lineCurSuffix = lineCur.mid(pos_x);
    
    // Get prefix lines
    QStringList linesPrefix;
    int startLine = qMax(1, pos_y - m_config.nPrefix);
    for (int i = startLine; i < pos_y; ++i) {
        QTextBlock b = m_currentDocument->findBlockByNumber(i - 1);
        if (b.isValid()) {
            linesPrefix.append(b.text());
        }
    }
    
    // Get suffix lines
    QStringList linesSuffix;
    int endLine = qMin(m_currentDocument->lineCount(), pos_y + m_config.nSuffix);
    for (int i = pos_y + 1; i <= endLine; ++i) {
        QTextBlock b = m_currentDocument->findBlockByNumber(i - 1);
        if (b.isValid()) {
            linesSuffix.append(b.text());
        }
    }
    
    QString prefix = linesPrefix.join("\n") + "\n";
    QString middle = lineCurPrefix;
    QString suffix = lineCurSuffix + "\n" + linesSuffix.join("\n") + "\n";
    
    // 'Î' is the same separator the vim plugin places between the middle and
    // suffix parts before hashing the context
    return prefix + middle + "Î" + suffix;
}

void LlamaPlugin::gatherExtraContext()
{
    if (m_config.ringNChunks <= 0 || !m_currentDocument) return;
    
    // Gather context around current position
    TextEditorWidget *editor = qobject_cast<TextEditorWidget*>(m_currentEditor->widget());
    if (!editor) return;
    
    QTextCursor cursor = editor->textCursor();
    int pos_y = cursor.blockNumber() + 1;
    
    // Get lines around cursor
    int startLine = qMax(1, pos_y - m_config.ringChunkSize/2);
    int endLine = qMin(m_currentDocument->lineCount(), pos_y + m_config.ringChunkSize/2);
    
    QStringList lines;
    for (int i = startLine; i <= endLine; ++i) {
        QTextBlock block = m_currentDocument->findBlockByNumber(i - 1);
        if (block.isValid()) {
            lines.append(block.text());
        }
    }
    
    if (lines.size() < 3) return;
    
    // Pick a random chunk; QRandomGenerator replaces the deprecated qrand(),
    // and the range is clamped so bounded() never receives a non-positive bound
    int maxStart = qMax(1, lines.size() - m_config.ringChunkSize/2);
    int l0 = QRandomGenerator::global()->bounded(maxStart);
    int l1 = qMin(l0 + m_config.ringChunkSize/2, lines.size());
    
    QStringList chunk = lines.mid(l0, l1 - l0);
    QString chunkStr = chunk.join("\n") + "\n";
    
    // Check if already exists
    for (const Chunk &c : m_ringChunks) {
        if (c.data == chunkStr) return;
    }
    for (const Chunk &c : m_ringQueued) {
        if (c.data == chunkStr) return;
    }
    
    // Evict similar chunks (substring containment is a cheap proxy here; the
    // chunkSimilarity() sketch after this function shows the vim plugin's metric)
    for (int i = m_ringQueued.size() - 1; i >= 0; --i) {
        const Chunk &queued = m_ringQueued[i];
        if (queued.data.contains(chunkStr) || chunkStr.contains(queued.data)) {
            m_ringQueued.removeAt(i);
            m_ringNEvict++;
        }
    }
    
    // Add to queued
    Chunk newChunk;
    newChunk.data = chunkStr;
    newChunk.time = QDateTime::currentDateTime();
    // Core::IDocument exposes filePath() (a Utils::FilePath) rather than a plain file name
    newChunk.filename = m_currentEditor->document()->filePath().toString();
    m_ringQueued.append(newChunk);
    
    if (m_ringQueued.size() > 16) {
        m_ringQueued.removeFirst();
    }
}
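
// The substring checks above are a rough stand-in for the vim plugin's
// s:chunk_sim(), which scores two chunks by the fraction of shared lines
// (0 - no similarity, 1 - high similarity). A direct transcription, offered
// as a sketch (chunkSimilarity is not called from the code above; callers
// would split the stored chunk strings on '\n' first):
static double chunkSimilarity(const QStringList &c0, const QStringList &c1)
{
    if (c0.isEmpty() && c1.isEmpty())
        return 0.0;
    
    int common = 0;
    for (const QString &line0 : c0) {
        if (c1.contains(line0)) // each line of c0 counts at most once, as in the vim inner loop
            ++common;
    }
    
    return 2.0 * common / (c0.size() + c1.size());
}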

void LlamaPlugin::processExtraContext()
{
    if (m_ringQueued.isEmpty()) return;
    
    // Move first queued chunk to ring buffer
    if (m_ringChunks.size() >= m_config.ringNChunks) {
        m_ringChunks.removeFirst();
    }
    
    Chunk chunk = m_ringQueued.takeFirst();
    m_ringChunks.append(chunk);
    
    // Send request to update context on server
    QJsonObject request;
    request["input_prefix"] = "";
    request["input_suffix"] = "";
    request["prompt"] = "";
    request["n_predict"] = 0;
    request["temperature"] = 0.0;
    request["stream"] = false;
    request["samplers"] = QStringList();
    request["cache_prompt"] = true;
    request["t_max_prompt_ms"] = 1;
    request["t_max_predict_ms"] = 1;
    
    // Add extra context
    QJsonArray extraContext;
    for (const Chunk &c : m_ringChunks) {
        QJsonObject chunkObj;
        chunkObj["text"] = c.data;
        chunkObj["time"] = c.time.toString(Qt::ISODate);
        chunkObj["filename"] = c.filename;
        extraContext.append(chunkObj);
    }
    request["input_extra"] = extraContext;
    
    // Create JSON document
    QJsonDocument doc(request);
    QByteArray jsonData = doc.toJson(QJsonDocument::Compact);
    
    // Send request to server
    QNetworkRequest req(QUrl(m_config.endpoint));
    req.setHeader(QNetworkRequest::ContentTypeHeader, "application/json");
    if (!m_config.apiKey.isEmpty()) {
        req.setRawHeader("Authorization", "Bearer " + m_config.apiKey.toUtf8());
    }
    
    m_networkManager->post(req, jsonData);
}

} // namespace Internal
} // namespace Llama
// llama_settings.h
#ifndef LLAMA_SETTINGS_H
#define LLAMA_SETTINGS_H

#include <QWidget>
#include <QDialog>
#include <QLineEdit>
#include <QSpinBox>
#include <QCheckBox>
#include <QListWidget>
#include <QPushButton>
#include <QVBoxLayout>
#include <QHBoxLayout>
#include <QGridLayout>

namespace Llama {
namespace Internal {

struct Config {
    QString endpoint;
    QString apiKey;
    int nPrefix;
    int nSuffix;
    int nPredict;
    QStringList stopStrings;
    int tMaxPromptMs;
    int tMaxPredictMs;
    int showInfo;
    bool autoFim;
    int maxLineSuffix;
    int maxCacheKeys;
    int ringNChunks;
    int ringChunkSize;
    int ringScope;
    int ringUpdateMs;
    QString keymapTrigger;
    QString keymapAcceptFull;
    QString keymapAcceptLine;
    QString keymapAcceptWord;
};

class LlamaSettings : public QDialog
{
    Q_OBJECT

public:
    explicit LlamaSettings(const Config &config, QWidget *parent = nullptr);
    
signals:
    void configChanged(const Config &newConfig);

private slots:
    void saveConfiguration();
    void addStopString();
    void removeStopString();

private:
    void setupUI();
    
    // Configuration widgets
    QLineEdit *m_endpointEdit;
    QLineEdit *m_apiKeyEdit;
    QSpinBox *m_nPrefixSpin;
    QSpinBox *m_nSuffixSpin;
    QSpinBox *m_nPredictSpin;
    QListWidget *m_stopStringsList;
    QPushButton *m_addStopButton;
    QPushButton *m_removeStopButton;
    QSpinBox *m_tMaxPromptSpin;
    QSpinBox *m_tMaxPredictSpin;
    QSpinBox *m_showInfoSpin;
    QCheckBox *m_autoFimCheck;
    QSpinBox *m_maxLineSuffixSpin;
    QSpinBox *m_maxCacheKeysSpin;
    QSpinBox *m_ringNChunksSpin;
    QSpinBox *m_ringChunkSizeSpin;
    QSpinBox *m_ringScopeSpin;
    QSpinBox *m_ringUpdateMsSpin;
    QLineEdit *m_keymapTriggerEdit;
    QLineEdit *m_keymapAcceptFullEdit;
    QLineEdit *m_keymapAcceptLineEdit;
    QLineEdit *m_keymapAcceptWordEdit;
    
    Config m_currentConfig;
};

} // namespace Internal
} // namespace Llama

#endif // LLAMA_SETTINGS_H
// llama_settings.cpp
#include "llama_settings.h"
#include <QLabel>
#include <QGroupBox>
#include <QDialogButtonBox>
#include <QApplication>

namespace Llama {
namespace Internal {

LlamaSettings::LlamaSettings(const Config &config, QWidget *parent)
    : QDialog(parent)
    , m_currentConfig(config)
{
    setupUI();
    setWindowTitle("LLaMA Plugin Settings");
}

void LlamaSettings::setupUI()
{
    QVBoxLayout *mainLayout = new QVBoxLayout(this);
    
    // Main configuration group
    QGroupBox *configGroup = new QGroupBox("Configuration");
    QGridLayout *configLayout = new QGridLayout(configGroup);
    
    configLayout->addWidget(new QLabel("Endpoint:"), 0, 0);
    m_endpointEdit = new QLineEdit(m_currentConfig.endpoint);
    configLayout->addWidget(m_endpointEdit, 0, 1);
    
    configLayout->addWidget(new QLabel("API Key:"), 1, 0);
    m_apiKeyEdit = new QLineEdit(m_currentConfig.apiKey);
    configLayout->addWidget(m_apiKeyEdit, 1, 1);
    
    configLayout->addWidget(new QLabel("Prefix Lines:"), 2, 0);
    m_nPrefixSpin = new QSpinBox();
    m_nPrefixSpin->setRange(1, 10000);
    m_nPrefixSpin->setValue(m_currentConfig.nPrefix);
    configLayout->addWidget(m_nPrefixSpin, 2, 1);
    
    configLayout->addWidget(new QLabel("Suffix Lines:"), 3, 0);
    m_nSuffixSpin = new QSpinBox();
    m_nSuffixSpin->setRange(1, 10000);
    m_nSuffixSpin->setValue(m_currentConfig.nSuffix);
    configLayout->addWidget(m_nSuffixSpin, 3, 1);
    
    configLayout->addWidget(new QLabel("Max Predictions:"), 4, 0);
    m_nPredictSpin = new QSpinBox();
    m_nPredictSpin->setRange(1, 10000);
    m_nPredictSpin->setValue(m_currentConfig.nPredict);
    configLayout->addWidget(m_nPredictSpin, 4, 1);
    
    configLayout->addWidget(new QLabel("Stop Strings:"), 5, 0);
    m_stopStringsList = new QListWidget();
    for (const QString &str : m_currentConfig.stopStrings) {
        m_stopStringsList->addItem(str);
    }
    configLayout->addWidget(m_stopStringsList, 5, 1);
    
    QHBoxLayout *stopButtonsLayout = new QHBoxLayout();
    m_addStopButton = new QPushButton("Add");
    m_removeStopButton = new QPushButton("Remove");
    connect(m_addStopButton, &QPushButton::clicked, this, &LlamaSettings::addStopString);
    connect(m_removeStopButton, &QPushButton::clicked, this, &LlamaSettings::removeStopString);
    stopButtonsLayout->addWidget(m_addStopButton);
    stopButtonsLayout->addWidget(m_removeStopButton);
    configLayout->addLayout(stopButtonsLayout, 6, 1);
    
    configLayout->addWidget(new QLabel("Max Prompt Time (ms):"), 7, 0);
    m_tMaxPromptSpin = new QSpinBox();
    m_tMaxPromptSpin->setRange(1, 100000);
    m_tMaxPromptSpin->setValue(m_currentConfig.tMaxPromptMs);
    configLayout->addWidget(m_tMaxPromptSpin, 7, 1);
    
    configLayout->addWidget(new QLabel("Max Predict Time (ms):"), 8, 0);
    m_tMaxPredictSpin = new QSpinBox();
    m_tMaxPredictSpin->setRange(1, 100000);
    m_tMaxPredictSpin->setValue(m_currentConfig.tMaxPredictMs);
    configLayout->addWidget(m_tMaxPredictSpin, 8, 1);
    
    configLayout->addWidget(new QLabel("Show Info:"), 9, 0);
    m_showInfoSpin = new QSpinBox();
    m_showInfoSpin->setRange(0, 2);
    m_showInfoSpin->setValue(m_currentConfig.showInfo);
    configLayout->addWidget(m_showInfoSpin, 9, 1);
    
    configLayout->addWidget(new QLabel("Auto FIM:"), 10, 0);
    m_autoFimCheck = new QCheckBox();
    m_autoFimCheck->setChecked(m_currentConfig.autoFim);
    configLayout->addWidget(m_autoFimCheck, 10, 1);
    
    configLayout->addWidget(new QLabel("Max Line Suffix:"), 11, 0);
    m_maxLineSuffixSpin = new QSpinBox();
    m_maxLineSuffixSpin->setRange(1, 1000);
    m_maxLineSuffixSpin->setValue(m_currentConfig.maxLineSuffix);
    configLayout->addWidget(m_maxLineSuffixSpin, 11, 1);
    
    configLayout->addWidget(new QLabel("Max Cache Keys:"), 12, 0);
    m_maxCacheKeysSpin = new QSpinBox();
    m_maxCacheKeysSpin->setRange(1, 10000);
    m_maxCacheKeysSpin->setValue(m_currentConfig.maxCacheKeys);
    configLayout->addWidget(m_maxCacheKeysSpin, 12, 1);
    
    mainLayout->addWidget(configGroup);
    
    // Context gathering group
    QGroupBox *contextGroup = new QGroupBox("Context Gathering");
    QGridLayout *contextLayout = new QGridLayout(contextGroup);
    
    contextLayout->addWidget(new QLabel("Ring Chunks:"), 0, 0);
    m_ringNChunksSpin = new QSpinBox();
    m_ringNChunksSpin->setRange(0, 1000);
    m_ringNChunksSpin->setValue(m_currentConfig.ringNChunks);
    contextLayout->addWidget(m_ringNChunksSpin, 0, 1);
    
    contextLayout->addWidget(new QLabel("Chunk Size:"), 1, 0);
    m_ringChunkSizeSpin = new QSpinBox();
    m_ringChunkSizeSpin->setRange(1, 1000);
    m_ringChunkSizeSpin->setValue(m_currentConfig.ringChunkSize);
    contextLayout->addWidget(m_ringChunkSizeSpin, 1, 1);
    
    contextLayout->addWidget(new QLabel("Ring Scope:"), 2, 0);
    m_ringScopeSpin = new QSpinBox();
    m_ringScopeSpin->setRange(1, 10000);
    m_ringScopeSpin->setValue(m_currentConfig.ringScope);
    contextLayout->addWidget(m_ringScopeSpin, 2, 1);
    
    contextLayout->addWidget(new QLabel("Update Interval (ms):"), 3, 0);
    m_ringUpdateMsSpin = new QSpinBox();
    m_ringUpdateMsSpin->setRange(100, 10000);
    m_ringUpdateMsSpin->setValue(m_currentConfig.ringUpdateMs);
    contextLayout->addWidget(m_ringUpdateMsSpin, 3, 1);
    
    mainLayout->addWidget(contextGroup);
    
    // Keymap group
    QGroupBox *keymapGroup = new QGroupBox("Keymaps");
    QGridLayout *keymapLayout = new QGridLayout(keymapGroup);
    
    keymapLayout->addWidget(new QLabel("Trigger:"), 0, 0);
    m_keymapTriggerEdit = new QLineEdit(m_currentConfig.keymapTrigger);
    keymapLayout->addWidget(m_keymapTriggerEdit, 0, 1);
    
    keymapLayout->addWidget(new QLabel("Accept Full:"), 1, 0);
    m_keymapAcceptFullEdit = new QLineEdit(m_currentConfig.keymapAcceptFull);
    keymapLayout->addWidget(m_keymapAcceptFullEdit, 1, 1);
    
    keymapLayout->addWidget(new QLabel("Accept Line:"), 2, 0);
    m_keymapAcceptLineEdit = new QLineEdit(m_currentConfig.keymapAcceptLine);
    keymapLayout->addWidget(m_keymapAcceptLineEdit, 2, 1);
    
    keymapLayout->addWidget(new QLabel("Accept Word:"), 3, 0);
    m_keymapAcceptWordEdit = new QLineEdit(m_currentConfig.keymapAcceptWord);
    keymapLayout->addWidget(m_keymapAcceptWordEdit, 3, 1);
    
    mainLayout->addWidget(keymapGroup);
    
    // Buttons
    QDialogButtonBox *buttonBox = new QDialogButtonBox(QDialogButtonBox::Save | QDialogButtonBox::Cancel);
    connect(buttonBox, &QDialogButtonBox::accepted, this, &LlamaSettings::saveConfiguration);
    connect(buttonBox, &QDialogButtonBox::rejected, this, &QDialog::reject);
    mainLayout->addWidget(buttonBox);
    
}

void LlamaSettings::saveConfiguration()
{
    m_currentConfig.endpoint = m_endpointEdit->text();
    m_currentConfig.apiKey = m_apiKeyEdit->text();
    m_currentConfig.nPrefix = m_nPrefixSpin->value();
    m_currentConfig.nSuffix = m_nSuffixSpin->value();
    m_currentConfig.nPredict = m_nPredictSpin->value();
    
    // Get stop strings
    m_currentConfig.stopStrings.clear();
    for (int i = 0; i < m_stopStringsList->count(); ++i) {
        m_currentConfig.stopStrings << m_stopStringsList->item(i)->text();
    }
    
    m_currentConfig.tMaxPromptMs = m_tMaxPromptSpin->value();
    m_currentConfig.tMaxPredictMs = m_tMaxPredictSpin->value();
    m_currentConfig.showInfo = m_showInfoSpin->value();
    m_currentConfig.autoFim = m_autoFimCheck->isChecked();
    m_currentConfig.maxLineSuffix = m_maxLineSuffixSpin->value();
    m_currentConfig.maxCacheKeys = m_maxCacheKeysSpin->value();
    m_currentConfig.ringNChunks = m_ringNChunksSpin->value();
    m_currentConfig.ringChunkSize = m_ringChunkSizeSpin->value();
    m_currentConfig.ringScope = m_ringScopeSpin->value();
    m_currentConfig.ringUpdateMs = m_ringUpdateMsSpin->value();
    m_currentConfig.keymapTrigger = m_keymapTriggerEdit->text();
    m_currentConfig.keymapAcceptFull = m_keymapAcceptFullEdit->text();
    m_currentConfig.keymapAcceptLine = m_keymapAcceptLineEdit->text();
    m_currentConfig.keymapAcceptWord = m_keymapAcceptWordEdit->text();
    
    emit configChanged(m_currentConfig);
    accept();
}

void LlamaSettings::addStopString()
{
    QString text = QInputDialog::getText(this, "Add Stop String", "Enter stop string:");
    if (!text.isEmpty()) {
        m_stopStringsList->addItem(text);
    }
}

void LlamaSettings::removeStopString()
{
    // Deleting a QListWidgetItem also removes it from its QListWidget,
    // so qDeleteAll() is enough to drop every selected entry.
    qDeleteAll(m_stopStringsList->selectedItems());
}

} // namespace Internal
} // namespace Llama
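
For reference, here is a minimal sketch of how the plugin might open this dialog. The showSettings() slot, m_config member, and applyConfig() helper are hypothetical names used only for illustration, not part of the code above:

// Illustrative only: opening the settings dialog from the plugin.
// showSettings(), m_config, and applyConfig() are hypothetical names.
#include <coreplugin/icore.h>

void LlamaPlugin::showSettings()
{
    LlamaSettings dialog(m_config, Core::ICore::dialogParent());

    // Adopt the new values when the user presses Save.
    connect(&dialog, &LlamaSettings::configChanged,
            this, [this](const Config &config) {
        m_config = config;
        applyConfig(); // e.g. re-register keymaps, restart timers
    });

    dialog.exec();
}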
// LlamaPlugin.json
{
    "IID": "org.qt-project.Qt.QtCreatorPlugin",
    "ClassName": "Llama::Internal::LlamaPlugin",
    "Name": "LLaMA Infill Plugin",
    "Version": "1.0.0",
    "Vendor": "Qt Project",
    "Description": "LLaMA Infill completion plugin for Qt Creator",
    "License": "GNU General Public License v3.0",
    "Dependencies": [
        {
            "Name": "Core",
            "Version": "1.0"
        },
        {
            "Name": "TextEditor",
            "Version": "1.0"
        }
    ]
}

Key Features Implemented

This Qt Creator plugin provides:

  1. Auto-completion with FIM (Fill-In-the-Middle): suggestions are built from a configurable window of lines around the cursor (nPrefix, nSuffix) and capped at nPredict tokens
  2. Context Management: a ring buffer of extra-context chunks, tunable via ringNChunks, ringChunkSize, ringScope, and ringUpdateMs
  3. Configuration System: the LlamaSettings dialog above, covering every parameter of the original vim plugin
  4. Caching Mechanism: responses are cached up to maxCacheKeys entries to avoid recomputing identical completions (see the sketch after this list)
  5. Network Integration: HTTP requests to the configured llama.cpp server endpoint, with optional API key (see the request sketch at the end of this answer)
  6. UI Integration: a native settings dialog reachable from Tools → Options, plus keymap-driven acceptance of suggestions

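The caching mechanism can mirror the vim plugin's s:cache_insert(), which evicts a random entry once max_cache_keys is reached. A minimal sketch (the CompletionCache class and its member names are illustrative, not taken from the code above):

// Completion cache with random eviction, mirroring the vim plugin's
// s:cache_insert(). The class and member names are illustrative.
#include <QHash>
#include <QRandomGenerator>
#include <QString>

class CompletionCache
{
public:
    explicit CompletionCache(int maxKeys) : m_maxKeys(maxKeys) {}

    void insert(const QString &key, const QString &value)
    {
        // Evict a random entry once the cache is full
        // (TODO: an LRU policy would be smarter, as the vim plugin notes).
        if (m_data.size() >= m_maxKeys && !m_data.contains(key)) {
            const QList<QString> keys = m_data.keys();
            const int idx = QRandomGenerator::global()->bounded(int(keys.size()));
            m_data.remove(keys.at(idx));
        }
        m_data.insert(key, value);
    }

    bool lookup(const QString &key, QString *value) const
    {
        const auto it = m_data.constFind(key);
        if (it == m_data.constEnd())
            return false;
        *value = it.value();
        return true;
    }

private:
    int m_maxKeys;
    QHash<QString, QString> m_data;
};
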
Usage Instructions

  1. Install the plugin in Qt Creator
  2. Configure settings through “Tools” → “Options” → “LLaMA Plugin”
  3. Set up your llama.cpp server endpoint
  4. Start editing code and get automatic suggestions
  5. Use the configured keymaps to accept completions: by default Tab accepts the full suggestion, Shift+Tab the current line, and Ctrl+B the next word (a registration sketch follows this list)

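Step 5 maps naturally onto Qt Creator's action manager. A minimal sketch, assuming the Core::ActionManager API of recent Qt Creator versions (the "Llama.AcceptFull" id and the surrounding function are hypothetical, and vim-style keymaps such as "<Tab>" would first need translating into Qt key-sequence strings like "Tab"):

// Illustrative sketch: registering the accept-full keymap through
// Qt Creator's action manager. The command id is a hypothetical name.
#include <coreplugin/actionmanager/actionmanager.h>
#include <coreplugin/actionmanager/command.h>
#include <QAction>
#include <QKeySequence>

void registerAcceptFullAction(QObject *parent, const QString &keySequence)
{
    auto *action = new QAction(QObject::tr("Accept Full Suggestion"), parent);
    Core::Command *cmd = Core::ActionManager::registerAction(action, "Llama.AcceptFull");

    // keySequence comes from the settings dialog, e.g. "Tab".
    cmd->setDefaultKeySequence(QKeySequence(keySequence));

    QObject::connect(action, &QAction::triggered, parent, [] {
        // ... insert the pending suggestion into the editor here ...
    });
}
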
The plugin provides an experience similar to the vim version, adapted to Qt Creator’s architecture and UI framework.
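
For completeness, a sketch of the request/response round-trip to the configured /infill endpoint using QNetworkAccessManager. The JSON field names (input_prefix, input_suffix, n_predict, content) are assumptions based on the llama.cpp server's infill API; verify them against your server version:

// Illustrative sketch: one FIM request to the llama.cpp /infill endpoint.
// The JSON field names are assumptions; check your llama.cpp server version.
#include <QNetworkAccessManager>
#include <QNetworkReply>
#include <QNetworkRequest>
#include <QJsonDocument>
#include <QJsonObject>
#include <QUrl>
#include <QDebug>

void requestInfill(QNetworkAccessManager *manager, const Config &config,
                   const QString &prefix, const QString &suffix)
{
    QNetworkRequest request(QUrl(config.endpoint));
    request.setHeader(QNetworkRequest::ContentTypeHeader, "application/json");
    if (!config.apiKey.isEmpty())
        request.setRawHeader("Authorization", "Bearer " + config.apiKey.toUtf8());

    QJsonObject body;
    body["input_prefix"] = prefix;          // assumed field name
    body["input_suffix"] = suffix;          // assumed field name
    body["n_predict"]    = config.nPredict; // assumed field name

    QNetworkReply *reply = manager->post(request, QJsonDocument(body).toJson());
    QObject::connect(reply, &QNetworkReply::finished, reply, [reply] {
        const QJsonObject response = QJsonDocument::fromJson(reply->readAll()).object();
        // The generated text is expected in the "content" field (assumption).
        qDebug() << response.value("content").toString();
        reply->deleteLater();
    });
}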