feat: added new bib parser in lua

This is similar to the Vimscript parser ("vim"), but since it is in Lua it is much faster and comparable to the current fastest parser ("bibtex"). refer: #2786
lervag · Oct 25, 2023 · de91b7b · clason · Oct 25, 2023 · lervag
1 parent 528aee7
commit de91b7b
Show file tree

Hide file tree

Showing 15 changed files with 358 additions and 53 deletions.
diff --git a/.luarc.json b/.luarc.json
@@ -0,0 +1,13 @@
+{
+  "$schema": "https://raw.githubusercontent.com/LuaLS/vscode-lua/master/setting/schema.json",
+  "runtime": {
+    "version": "LuaJIT"
+  },
+  "workspace": {
+    "library": [
+      "$VIMRUNTIME",
+      "${3rd}/luv/library"
+    ],
+    "checkThirdParty": false
+  }
+}
diff --git a/.stylua.toml b/.stylua.toml
@@ -0,0 +1,5 @@
+column_width = 80
+indent_type = "Spaces"
+indent_width = 2
+quote_style = "AutoPreferDouble"
+call_parentheses = "None"
diff --git a/autoload/vimtex/context/cite.vim b/autoload/vimtex/context/cite.vim
@@ -48,7 +48,10 @@ function! s:handler.get_actions() abort dict " {{{1
   call vimtex#paths#pushd(b:vimtex.root)
   let l:entries = []
   for l:file in vimtex#bib#files()
-    let l:entries += vimtex#parser#bib(l:file, {'backend': 'vim'})
+    let l:entries += vimtex#parser#bib(
+          \ l:file,
+          \ {'backend': has('nvim') ? 'lua' : 'vim'}
+          \)
   endfor
   call vimtex#paths#popd()
 
@@ -119,7 +122,7 @@ function! s:actions.show() abort dict " {{{1
         \ ['Normal', ','],
         \])
 
-  for l:x in ['key', 'type', 'vimtex_lnum', 'vimtex_file']
+  for l:x in ['key', 'type', 'source_lnum', 'source_file']
     if has_key(l:entry, l:x)
       call remove(l:entry, l:x)
     endif
@@ -144,10 +147,10 @@ endfunction
 
 " }}}1
 function! s:actions.edit() abort dict " {{{1
-  execute 'edit' self.entry.vimtex_file
+  execute 'edit' self.entry.source_file
   filetype detect
 
-  call vimtex#pos#set_cursor(self.entry.vimtex_lnum, 0)
+  call vimtex#pos#set_cursor(self.entry.source_lnum, 0)
   normal! zv
 endfunction
 

diff --git a/autoload/vimtex/options.vim b/autoload/vimtex/options.vim
@@ -311,7 +311,9 @@ function! vimtex#options#init() abort " {{{1
   call s:init_option('vimtex_lint_chktex_ignore_warnings',
         \ '-n1 -n3 -n8 -n25 -n36')
 
-  call s:init_option('vimtex_parser_bib_backend', 'bibtex')
+  call s:init_option('vimtex_parser_bib_backend',
+        \ has('nvim') ? 'lua' : 'bibtex'
+        \)
   call s:init_option('vimtex_parser_cmd_separator_check',
         \ 'vimtex#cmd#parser_separator_check')
 

diff --git a/autoload/vimtex/parser/bib.vim b/autoload/vimtex/parser/bib.vim
@@ -250,6 +250,18 @@ endfunction
 
 " }}}1
 
+function! s:parse_with_lua(file) abort " {{{1
+  " The Lua backend is implemented in lua/vimtex/bibparser.lua and is only
+  " available on neovim; elsewhere we log an error and return no entries.
+  if has('nvim')
+    return luaeval('require("vimtex.bibparser").parse(_A)', a:file)
+  endif
+
+  call vimtex#log#error(
+        \ 'bib parser backend "lua" only works with neovim!')
+  return []
+endfunction
+
+" }}}1
+
 function! s:parse_with_vim(file) abort " {{{1
   " Adheres to the format description found here:
   " http://www.bibtex.org/Format/
@@ -297,8 +309,8 @@ function! s:parse_type(file, lnum, line, current, strings, entries) abort " {{{1
 
   let a:current.level = 1
   let a:current.body = ''
-  let a:current.vimtex_file = a:file
-  let a:current.vimtex_lnum = a:lnum
+  let a:current.source_file = a:file
+  let a:current.source_lnum = a:lnum
 
   if l:type ==# 'string'
     return s:parse_string(l:matches[2], a:current, a:strings)
@@ -420,7 +432,7 @@ function! s:get_value_string(body, head, strings) abort " {{{1
   elseif a:body[a:head] ==# '"'
     let l:index = match(a:body, '\\\@<!"', a:head+1)
     if l:index < 0
-      return ['s:get_value_string failed', '']
+      return ['s:get_value_string failed', -1]
     endif
 
     let l:value = a:body[a:head+1:l:index-1]

diff --git a/doc/vimtex.txt b/doc/vimtex.txt
@@ -1310,8 +1310,14 @@ OPTIONS                                                        *vimtex-options*
   This option sets the desired default backend for parsing bibliographies.
   This is used e.g. for gathering completion candidates. Possible values:
 
-    `bibtex`:   The fastest, but most hacky solution. Should work well in most
-              cases.
+    `bibtex`:   The fastest, but most "hacky" solution. Still, time has proved
+              that this works well!
+
+    `vim`:      The slowest but perhaps most robust solution, as it does not
+              require any external utilities.
+
+    `lua`:      A Lua implementation of the Vim backend. About as fast as the
+              `bibtex` parser, but this only works on Neovim.
 
     `bibparse`: Also fast, but might be more robust.
 
@@ -1335,17 +1341,17 @@ OPTIONS                                                        *vimtex-options*
                     (see |if_pyth| and |py3|) and that the `bibtexparser`
                     Python module is installed and available.
 
-    `vim`:      The slowest but perhaps most robust solution, as it does not
-              require any external utilities.
-
   Some people may want to conditionally change this option if a backend is
   available. For example: >vim
 
     if executable('bibparse')
       let g:vimtex_parser_bib_backend = 'bibparse'
     endif
 <
-  Default value: `bibtex`
+  Default value:
+
+    Vim:    `bibtex`
+    Neovim: `lua`
 
 *g:vimtex_parser_cmd_separator_check*
   This option specifies the policy for deciding whether successive groups of

diff --git a/lua/vimtex/bibparser.lua b/lua/vimtex/bibparser.lua
@@ -0,0 +1,235 @@
+-- VimTeX - LaTeX plugin for Vim
+--
+-- Maintainer: Karl Yngve Lervåg
+-- Email:      [email protected]
+--
+
+---Consume a continuation line of an entry that has already been opened.
+---The brace nesting level is updated from the braces found in the line; once
+---the level is no longer positive the entry is flagged as parsed.
+---@param item table The entry being accumulated
+---@param line string The next line of text belonging to the entry
+---@return table item The same entry with updated level and body
+local function parse_tail(item, line)
+  -- gsub returns the number of substitutions as its second value, which is
+  -- the number of braces of each kind in the line
+  local _, n_open = line:gsub("{", "")
+  local _, n_close = line:gsub("}", "")
+  item.level = item.level + n_open - n_close
+
+  if item.level <= 0 then
+    -- Entry is closed: keep only the text before the last closing brace
+    item.body = item.body .. vim.fn.matchstr(line, [[.*\ze}]])
+    item.parsed = true
+    return item
+  end
+
+  item.body = item.body .. line
+  return item
+end
+
+---Parse the head part of an entry
+---@param file string The path to the bibtex file
+---@param lnum integer The line number for the entry
+---@param line string The line content of the entry
+---@return table item New entry parsed so far, or an empty table if the line
+---  does not start an entry (or starts a @preamble/@comment entry)
+local function parse_head(file, lnum, line)
+  local matches = vim.fn.matchlist(line, [[\v^\@(\w+)\s*\{\s*(.*)]])
+  if #matches == 0 then
+    return {}
+  end
+
+  -- Named "entry_type" to avoid shadowing the builtin "type" function
+  local entry_type = string.lower(matches[2])
+  if entry_type == "preamble" or entry_type == "comment" then
+    return {}
+  end
+
+  -- The remainder of the line after the opening brace is handled like any
+  -- other continuation line, starting from nesting level 1
+  return parse_tail({
+    level = 1,
+    body = "",
+    source_file = file,
+    source_lnum = lnum,
+    type = entry_type,
+  }, matches[3])
+end
+
+---Parse the value part of a bib entry tag until separating comma or end.
+---The value is either a braced group, a quoted string or a bare word, and it
+---may be a "#" concatenation of such parts. Bare words are looked up among
+---the abbreviations defined by @string entries.
+---@param body string Raw entry text that contains the tag value
+---@param head integer 0-based offset in body where the value starts
+---@param strings table<string, string> Abbreviations from @string entries
+---@param pre_value string Value accumulated from earlier "#"-joined parts
+---@return string value The parsed value (pre_value plus the new parts)
+---@return integer head New head position; -1 if no comma follows or on failure
+local function get_tag_value_concat(body, head, strings, pre_value)
+  local value = ""
+  local new_head = head
+
+  if body:sub(head + 1, head + 1) == "{" then -- braced value: {...}
+    local sum = 1 -- brace nesting depth
+    local i = head + 1 -- 0-based offset just past the opening brace
+    local n = #body
+
+    while sum > 0 and i <= n do
+      local char = body:sub(i + 1, i + 1)
+      if char == "{" then
+        sum = sum + 1
+      elseif char == "}" then
+        sum = sum - 1
+      end
+
+      i = i + 1
+    end
+
+    value = body:sub(head + 2, i - 1) -- text inside the outer braces (if balanced)
+    new_head = vim.fn.matchend(body, [[^\s*]], i)
+  elseif body:sub(head + 1, head + 1) == [["]] then -- quoted value: "..."
+    local index = vim.fn.match(body, [[\\\@<!"]], head + 1) -- next unescaped quote
+    if index < 0 then
+      return "bibparser.lua: get_tag_value_concat failed", -1
+    end
+
+    value = body:sub(head + 1 + 1, index - 1 + 1) -- 0-based range -> 1-based sub
+    new_head = vim.fn.matchend(body, [[^\s*]], index + 1)
+  elseif vim.fn.match(body, [[^\w]], head) >= 0 then -- bare word
+    value = vim.fn.matchstr(body, [[^\w[0-9a-zA-Z_-]*]], head)
+    new_head = vim.fn.matchend(body, [[^\s*]], head + vim.fn.strlen(value))
+    value = vim.fn.get(strings, value, [[@(]] .. value .. [[)]]) -- expand abbreviation, else @(name)
+  end
+
+  if body:sub(new_head + 1, new_head + 1) == "#" then -- "#" joins another part
+    new_head = vim.fn.matchend(body, [[^\s*]], new_head + 1)
+    return get_tag_value_concat(body, new_head, strings, pre_value .. value)
+  end
+
+  return pre_value .. value, vim.fn.matchend(body, [[^,\s*]], new_head)
+end
+
+---Parse the value part of a bib entry tag until separating comma or end.
+---@param body string Raw entry text that contains the tag value
+---@param head integer 0-based offset in body where the value starts
+---@param strings table<string, string> Abbreviations from @string entries
+---@return string value The parsed value
+---@return integer head New head position (negative when no comma follows)
+local function get_tag_value(body, head, strings)
+  -- Fast path: the value is simply a number (e.g. "year = 2023"). A Lua
+  -- pattern is used so that no Vim regex is compiled on every call; "^"
+  -- anchors the match at the 1-based init position head + 1.
+  local number = body:match("^%d+", head + 1)
+  if number then
+    return number, vim.fn.matchend(body, [[^\s*,\s*]], head + #number)
+  end
+
+  return get_tag_value_concat(body, head, strings, "")
+end
+
+---Read a tag name (e.g. author, title, etc) followed by an equals sign
+---@param body string Raw text in which to find the tag
+---@param head integer Offset in body at which the tag is expected to start
+---@return string tag_name The lower-cased tag name, or "" if none was found
+---@return integer head Position just after the "=" part, or -1 if not found
+local function get_tag_name(body, head)
+  local found = vim.fn.matchlist(body, [[^\v([-_:0-9a-zA-Z]+)\s*\=\s*]], head)
+  if next(found) == nil then
+    return "", -1
+  end
+
+  -- found[1] is the whole match (name, "=" and surrounding whitespace),
+  -- found[2] is the captured name
+  local whole, name = found[1], found[2]
+  return name:lower(), head + vim.fn.strlen(whole)
+end
+
+---Turn a raw, fully collected entry into its final form.
+---The entry key is split off the accumulated body, the temporary parser
+---fields are dropped, and every "tag = value" pair found in the remaining
+---body is stored on the entry under the parsed tag name.
+---@param item table Entry as accumulated by the line parser
+---@param strings table<string, string> Abbreviations from @string entries
+---@return table|nil item The finished entry, or nil if it has no key
+local function parse_item(item, strings)
+  local parts = vim.fn.matchlist(item.body, [[\v^([^, ]*)\s*,\s*(.*)]])
+  local key, body = parts[2], parts[3]
+
+  item.key = key
+  if key == nil or key == "" then
+    return nil
+  end
+
+  -- Drop the bookkeeping fields that were only needed while reading lines
+  item.level = nil
+  item.parsed = nil
+  item.body = nil
+
+  -- Alternate between reading a tag name and its value; a negative head
+  -- signals that nothing more could be parsed.
+  local head = 0
+  while true do
+    local tag
+    tag, head = get_tag_name(body, head)
+    if head < 0 then
+      break
+    end
+
+    local value
+    value, head = get_tag_value(body, head, strings)
+    item[tag] = value
+    if head < 0 then
+      break
+    end
+  end
+
+  return item
+end
+
+---Split the body of an @string entry into abbreviation name and value.
+---The value may be delimited either by double quotes or by braces.
+---@param raw_string string Body of the @string entry
+---@return string key The abbreviation name, or "" if nothing was recognized
+---@return string value The abbreviation value, or "" if nothing was recognized
+local function parse_string(raw_string)
+  -- Tried in order: quoted form first, then braced form
+  local patterns = {
+    [[\v^\s*(\S+)\s*\=\s*"(.*)"\s*$]],
+    [[\v^\s*(\S+)\s*\=\s*\{(.*)\}\s*$]],
+  }
+
+  for _, pattern in ipairs(patterns) do
+    local found = vim.fn.matchlist(raw_string, pattern)
+    if vim.fn.empty(found[3]) == 0 then
+      return found[2], found[3]
+    end
+  end
+
+  return "", ""
+end
+
+local M = {}
+
+---Parse the specified bibtex file
+---The parser adheres to the format description found here:
+---http://www.bibtex.org/Format/
+---@param file string Path to the bibtex file
+---@return table[] entries Parsed entries; empty if file is nil or unreadable
+M.parse = function(file)
+  -- filereadable() returns the numbers 0/1 and 0 is truthy in Lua, so the
+  -- result must be compared explicitly instead of negated with "not".
+  if file == nil or vim.fn.filereadable(file) == 0 then
+    return {}
+  end
+
+  local items = {}
+  local strings = {}
+
+  -- First pass: collect raw entries line by line. An empty "item" means that
+  -- we are between entries and should look for a new entry head.
+  local item = {}
+  local lines = vim.fn.readfile(file)
+  for lnum = 1, #lines do
+    local line = lines[lnum]
+
+    if vim.tbl_isempty(item) then
+      item = parse_head(file, lnum, line)
+    else
+      item = parse_tail(item, line)
+    end
+
+    if item.parsed then
+      if item.type == "string" then
+        -- @string entries define abbreviations, they are not real entries
+        local key, value = parse_string(item.body)
+        if key ~= "" then
+          strings[key] = value
+        end
+      else
+        items[#items + 1] = item
+      end
+      item = {}
+    end
+  end
+
+  -- Second pass: parse the entry bodies now that all abbreviations are known.
+  -- Entries without a key are skipped (parse_item returns nil for these).
+  local result = {}
+  for _, x in ipairs(items) do
+    local parsed = parse_item(x, strings)
+    if parsed ~= nil then
+      result[#result + 1] = parsed
+    end
+  end
+  return result
+end
+
+return M
diff --git a/test/test-completion-bibtex-speed/Makefile b/test/test-completion-bibtex-speed/Makefile
@@ -3,8 +3,9 @@
 MYVIM ?= nvim --clean --headless
 
 test:
-	@INMAKE=1                      $(MYVIM) -u bibspeed.vim
+	@INMAKE=1 BACKEND=bibtex       $(MYVIM) -u bibspeed.vim
 	@INMAKE=1 BACKEND=vim          $(MYVIM) -u bibspeed.vim
+	@INMAKE=1 BACKEND=lua          $(MYVIM) -u bibspeed.vim
 	@#INMAKE=1 BACKEND=bibparse     $(MYVIM) -u bibspeed.vim
 	@#INMAKE=1 BACKEND=bibtexparser $(MYVIM) -u bibspeed.vim
 	@rm -f nvim_servernames.log