Skip to content

Commit

Permalink
feat: added new bib parser in lua
Browse files Browse the repository at this point in the history
This is similar to the Vimscript parser ("vim"), but since it is in Lua
it is much faster and comparable to the current fastest parser
("bibtex").

refer: #2786
  • Loading branch information
lervag committed Oct 25, 2023
1 parent 528aee7 commit de91b7b
Show file tree
Hide file tree
Showing 15 changed files with 358 additions and 53 deletions.
13 changes: 13 additions & 0 deletions .luarc.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"$schema": "https://raw.githubusercontent.com/LuaLS/vscode-lua/master/setting/schema.json",
"runtime": {
"version": "LuaJIT"
},
"workspace": {
"library": [
"$VIMRUNTIME",
"${3rd}/luv/library"
],
"checkThirdParty": false
}
}
5 changes: 5 additions & 0 deletions .stylua.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
column_width = 80
indent_type = "Spaces"
indent_width = 2
quote_style = "AutoPreferDouble"
call_parentheses = "None"
11 changes: 7 additions & 4 deletions autoload/vimtex/context/cite.vim
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,10 @@ function! s:handler.get_actions() abort dict " {{{1
call vimtex#paths#pushd(b:vimtex.root)
let l:entries = []
for l:file in vimtex#bib#files()
let l:entries += vimtex#parser#bib(l:file, {'backend': 'vim'})
let l:entries += vimtex#parser#bib(
\ l:file,
\ {'backend': has('nvim') ? 'lua' : 'vim'}
\)
endfor
call vimtex#paths#popd()

Expand Down Expand Up @@ -119,7 +122,7 @@ function! s:actions.show() abort dict " {{{1
\ ['Normal', ','],
\])

for l:x in ['key', 'type', 'vimtex_lnum', 'vimtex_file']
for l:x in ['key', 'type', 'source_lnum', 'source_file']
if has_key(l:entry, l:x)
call remove(l:entry, l:x)
endif
Expand All @@ -144,10 +147,10 @@ endfunction

" }}}1
function! s:actions.edit() abort dict " {{{1
execute 'edit' self.entry.vimtex_file
execute 'edit' self.entry.source_file
filetype detect

call vimtex#pos#set_cursor(self.entry.vimtex_lnum, 0)
call vimtex#pos#set_cursor(self.entry.source_lnum, 0)
normal! zv
endfunction

Expand Down
4 changes: 3 additions & 1 deletion autoload/vimtex/options.vim
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,9 @@ function! vimtex#options#init() abort " {{{1
call s:init_option('vimtex_lint_chktex_ignore_warnings',
\ '-n1 -n3 -n8 -n25 -n36')

call s:init_option('vimtex_parser_bib_backend', 'bibtex')
call s:init_option('vimtex_parser_bib_backend',
\ has('nvim') ? 'lua' : 'bibtex'
\)
call s:init_option('vimtex_parser_cmd_separator_check',
\ 'vimtex#cmd#parser_separator_check')

Expand Down
18 changes: 15 additions & 3 deletions autoload/vimtex/parser/bib.vim
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,18 @@ endfunction

" }}}1

function! s:parse_with_lua(file) abort " {{{1
  " Parse a bib file with the Lua backend (lua/vimtex/bibparser.lua).
  " The Lua module is only reachable through luaeval on neovim; on other
  " editors an error is logged and an empty list is returned.
  if has('nvim')
    return luaeval('require("vimtex.bibparser").parse(_A)', a:file)
  endif

  call vimtex#log#error(
        \ 'bib parser backend "lua" only works with neovim!')
  return []
endfunction

" }}}1

function! s:parse_with_vim(file) abort " {{{1
" Adheres to the format description found here:
" http://www.bibtex.org/Format/
Expand Down Expand Up @@ -297,8 +309,8 @@ function! s:parse_type(file, lnum, line, current, strings, entries) abort " {{{1

let a:current.level = 1
let a:current.body = ''
let a:current.vimtex_file = a:file
let a:current.vimtex_lnum = a:lnum
let a:current.source_file = a:file
let a:current.source_lnum = a:lnum

if l:type ==# 'string'
return s:parse_string(l:matches[2], a:current, a:strings)
Expand Down Expand Up @@ -420,7 +432,7 @@ function! s:get_value_string(body, head, strings) abort " {{{1
elseif a:body[a:head] ==# '"'
let l:index = match(a:body, '\\\@<!"', a:head+1)
if l:index < 0
return ['s:get_value_string failed', '']
return ['s:get_value_string failed', -1]
endif

let l:value = a:body[a:head+1:l:index-1]
Expand Down
18 changes: 12 additions & 6 deletions doc/vimtex.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1310,8 +1310,14 @@ OPTIONS *vimtex-options*
This option sets the desired default backend for parsing bibliographies.
This is used e.g. for gathering completion candidates. Possible values:

`bibtex`: The fastest, but most hacky solution. Should work well in most
cases.
`bibtex`: The fastest, but most "hacky" solution. Still, time has proved
that this works well!

`vim`: The slowest but perhaps most robust solution, as it does not
require any external utilities.

`lua`: A Lua implementation of the Vim backend. About as fast as the
`bibtex` parser, but this only works on Neovim.

`bibparse`: Also fast, but might be more robust.

Expand All @@ -1335,17 +1341,17 @@ OPTIONS *vimtex-options*
(see |if_pyth| and |py3|) and that the `bibtexparser`
Python module is installed and available.

`vim`: The slowest but perhaps most robust solution, as it does not
require any external utilities.

Some people may want to conditionally change this option if a backend is
available. For example: >vim

if executable('bibparse')
let g:vimtex_parser_bib_backend = 'bibparse'
endif
<
Default value: `bibtex`
Default value:

Vim: `bibtex`
Neovim: `lua`

*g:vimtex_parser_cmd_separator_check*
This option specifies the policy for deciding whether successive groups of
Expand Down
235 changes: 235 additions & 0 deletions lua/vimtex/bibparser.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
-- VimTeX - LaTeX plugin for Vim
--
-- Maintainer: Karl Yngve Lervåg
-- Email: [email protected]
--

---Parse input line as middle or tail part of an entry
---@param item table The current entry
---@param line string The new line to parse
---@return table item Current entry with updated body
local function parse_tail(item, line)
item.level = item.level
+ line:gsub("[^{]", ""):len()
- line:gsub("[^}]", ""):len()
if item.level > 0 then
item.body = item.body .. line
else
item.body = item.body .. vim.fn.matchstr(line, [[.*\ze}]])

This comment has been minimized.

Copy link
@clason

clason Oct 25, 2023

Contributor

there's also vim.regex, if you reuse the same pattern multiple times.

This comment has been minimized.

Copy link
@lervag

lervag Oct 25, 2023

Author Owner

Does vim.regex have any benefits except convenience?

This comment has been minimized.

Copy link
@clason

clason Oct 25, 2023

Contributor

Here probably not; it might be in a tight loop if you can hoist vim.regex outside the loop and just use the result to :match inside.

vim.fn is not to be avoided per se, but it can be a bottleneck if it's called often or on large data.

(Hence the caveat to benchmark.)

This comment has been minimized.

Copy link
@lervag

lervag Oct 26, 2023

Author Owner

Cool, thanks for the info!

item.parsed = true
end

return item
end

---Try to open a new entry from a line of the form "@type{ ...".
---Lines that do not open an entry, as well as entries of type "preamble" or
---"comment" (compared case-insensitively), yield an empty table.
---@param file string Path to the bibtex file being parsed
---@param lnum integer Line number on which the entry starts
---@param line string Content of that line
---@return table item New entry after passing the rest of the line through
---  parse_tail, or an empty table if no entry was opened
local function parse_head(file, lnum, line)
  local captures = vim.fn.matchlist(line, [[\v^\@(\w+)\s*\{\s*(.*)]])
  if next(captures) == nil then
    return {}
  end

  -- Entry types are case-insensitive in bibtex; normalise before comparing.
  local entry_type = captures[2]:lower()
  if entry_type == "preamble" or entry_type == "comment" then
    return {}
  end

  -- The opening brace has been consumed by the pattern, hence level = 1.
  local item = {
    type = entry_type,
    source_file = file,
    source_lnum = lnum,
    body = "",
    level = 1,
  }

  return parse_tail(item, captures[3])
end

---Parse the value part of a bib entry tag until separating comma or end.
---
---Three kinds of value are recognised at `head`:
---  * a brace delimited group `{...}` (nested braces are balanced),
---  * a double quoted string `"..."` (a quote preceded by a backslash does
---    not terminate it),
---  * a bare word, which is looked up in `strings` (abbreviations defined by
---    @string entries); unknown words are rendered as `@(word)`.
---If the value is followed by `#`, the next value is parsed recursively and
---concatenated.
---
---`head` and the returned position are 0-based byte offsets into `body`, as
---used by the Vim `match*()` functions.
---@param body string Raw text of the entry
---@param head integer Offset at which the value starts
---@param strings table<string, string> Known abbreviations
---@param pre_value string Text accumulated from earlier `#` operands
---@return string value The parsed value (or an error message on failure)
---@return integer head Offset after the separating comma, or -1 if there is
---  none or if a quoted string is unterminated
local function get_tag_value_concat(body, head, strings, pre_value)
  local value = ""
  local new_head = head
  local first = body:sub(head + 1, head + 1)

  if first == "{" then
    -- Walk forward until the brace opened at `head` is closed again. `i` is
    -- a 0-based offset, so the character it refers to is body:sub(i + 1).
    local depth = 1
    local i = head + 1
    local n = #body

    while depth > 0 and i <= n do
      local char = body:sub(i + 1, i + 1)
      if char == "{" then
        depth = depth + 1
      elseif char == "}" then
        depth = depth - 1
      end

      i = i + 1
    end

    value = body:sub(head + 2, i - 1)
    new_head = vim.fn.matchend(body, [[^\s*]], i)
  elseif first == [["]] then
    local index = vim.fn.match(body, [[\\\@<!"]], head + 1)
    if index < 0 then
      return "bibparser.lua: get_tag_value_concat failed", -1
    end

    value = body:sub(head + 2, index)
    new_head = vim.fn.matchend(body, [[^\s*]], index + 1)
  elseif vim.fn.match(body, [[^\w]], head) >= 0 then
    local name = vim.fn.matchstr(body, [[^\w[0-9a-zA-Z_-]*]], head)
    new_head = vim.fn.matchend(body, [[^\s*]], head + #name)

    -- Index the Lua table directly: going through vim.fn.get() would
    -- convert the whole `strings` table to a Vim dictionary on every lookup.
    value = strings[name] or ("@(" .. name .. ")")
  end

  if body:sub(new_head + 1, new_head + 1) == "#" then
    new_head = vim.fn.matchend(body, [[^\s*]], new_head + 1)
    return get_tag_value_concat(body, new_head, strings, pre_value .. value)
  end

  return pre_value .. value, vim.fn.matchend(body, [[^,\s*]], new_head)
end

---Parse the value part of a bib entry tag until separating comma or end.
---
---A value that starts with a digit is taken to be a plain number and is
---handled here with Lua patterns only; everything else is delegated to
---get_tag_value_concat.
---
---`head` and the returned position are 0-based byte offsets into `body`.
---@param body string Raw text of the entry
---@param head integer Offset at which the value starts
---@param strings table<string, string> Known abbreviations
---@return string value The parsed value
---@return integer head Offset after the separating comma and surrounding
---  blanks, or -1 if no comma follows the value
local function get_tag_value(body, head, strings)
  -- First check if the value is simply a number. Lua string positions are
  -- 1-based, hence the "+ 1" when converting from the 0-based offset.
  local digits = body:match("^%d+", head + 1)
  if digits then
    -- [ \t] rather than %s: this mirrors Vim's \s, which only matches
    -- space and tab. The 1-based end index returned by find() equals the
    -- 0-based offset of the first character after the match.
    local _, stop = body:find("^[ \t]*,[ \t]*", head + #digits + 1)
    return digits, stop or -1
  end

  return get_tag_value_concat(body, head, strings, "")
end

---Read a tag name (e.g. author, title) followed by "=" from `body`.
---The match is anchored at `head`; the name is returned in lower case.
---@param body string Raw text in which to find tag
---@param head integer Where to start search for tag
---@return string tag_name The parsed tag, or "" if nothing matched
---@return integer head Position just after the "=" and trailing
---  whitespace, or -1 if nothing matched
local function get_tag_name(body, head)
  local found =
    vim.fn.matchlist(body, [[^\v([-_:0-9a-zA-Z]+)\s*\=\s*]], head)
  local whole, name = found[1], found[2]

  -- matchlist() gives an empty list when the pattern does not match.
  if whole == nil then
    return "", -1
  end

  return name:lower(), head + #whole
end

---Turn a raw entry into its final form.
---The entry body is split into the citation key and a list of
---"tag = value" pairs; each pair is stored on the item as item[tag]. The
---bookkeeping fields `level`, `parsed` and `body` are removed.
---@param item table Entry whose `body` holds the raw text
---@param strings table<string, string> Known abbreviations
---@return table|nil item The same table, updated in place, or nil if no
---  key could be found
local function parse_item(item, strings)
  local parts = vim.fn.matchlist(item.body, [[\v^([^, ]*)\s*,\s*(.*)]])
  local key, body = parts[2], parts[3]

  item.key = key
  if not key or key == "" then
    return nil
  end

  item.level, item.parsed, item.body = nil, nil, nil

  -- Alternate between reading a tag name and its value. Either step signals
  -- "nothing more to read" by returning a negative position.
  local head = 0
  repeat
    local tag
    tag, head = get_tag_name(body, head)
    if head >= 0 then
      local value
      value, head = get_tag_value(body, head, strings)
      item[tag] = value
    end
  until head < 0

  return item
end

---Parse a string entry
---@param raw_string string
---@return string key
---@return string value
local function parse_string(raw_string)
local matches =
vim.fn.matchlist(raw_string, [[\v^\s*(\S+)\s*\=\s*"(.*)"\s*$]])
if vim.fn.empty(matches[3]) == 0 then

This comment has been minimized.

Copy link
@clason

clason Oct 25, 2023

Contributor
if vim.tbl_isempty(matches[3]) then

or

if #matches[3] > 0 then

This comment has been minimized.

Copy link
@lervag

lervag Oct 25, 2023

Author Owner

matches[3] should be a string, but it may also be nil. So perhaps it is better with matches[3] ~= nil and matches[3] ~= "" or something like that?

This comment has been minimized.

Copy link
@clason

clason Oct 25, 2023

Contributor

matches[3] and #matches[3]>0 is the standard idiom.

return matches[2], matches[3]
end

matches = vim.fn.matchlist(raw_string, [[\v^\s*(\S+)\s*\=\s*\{(.*)\}\s*$]])
if vim.fn.empty(matches[3]) == 0 then
return matches[2], matches[3]
end

return "", ""
end

local M = {}

---Parse the specified bibtex file
---The parser adheres to the format description found here:
---http://www.bibtex.org/Format/
---@param file string
---@return table[]
M.parse = function(file)
if file == nil or not vim.fn.filereadable(file) then
return {}
end

local items = {}
local strings = {}

local item = {}
local key, value
local lines = vim.fn.readfile(file)

This comment has been minimized.

Copy link
@clason

clason Oct 25, 2023

Contributor

First lua file 🥳

Some possible micro-optimizations:

function M.read_file(filename)
  local file = assert(io.open(filename, 'r'))
  local r = file:read('*a')
  file:close()
  return r
end

(Bypasses type conversion from vimscript to Lua)

This comment has been minimized.

Copy link
@lervag

lervag Oct 25, 2023

Author Owner

Thanks for the feedback! I'm still not so experienced in Lua, so any good suggestions are very welcome here!

I don't think this .read_file() works the same as the vim.fn.readfile() - it seems r here is not a list of lines..?

This comment has been minimized.

Copy link
@clason

clason Oct 25, 2023

Contributor

ah, you might want to vim.split(..., '\n') the result then. (The point is to try to avoid conversions for large arrays; as people tend to love single huge "database" bibfiles, this might pay off here. Or not -- one would have to benchmark this and the other suggestions.)

This comment has been minimized.

Copy link
@lervag

lervag Oct 25, 2023

Author Owner

By the way: It would be nice to learn how to do some simple profiling here. I believe the slowest part is the part near line 69 where I'm matching braces to find the closing brace position in the string body. But I'm not sure if it can be much faster than it is.

This comment has been minimized.

Copy link
@lervag

lervag Oct 25, 2023

Author Owner

Also, one would think there should be another file reader for Lua if the conversion is suboptimal as you say? Something like :help uv_fs_t?

This comment has been minimized.

Copy link
@clason

clason Oct 25, 2023

Contributor

By the way: It would be nice to learn how to do some simple profiling here.

Profiling in Lua is hard; I tend to just do A/B benchmarking. If you have LuaJIT, you can use the builtin sampling profiler (see https://github.com/nvim-lua/plenary.nvim/blob/master/lua/plenary/profile.lua), but this requires a bit of parameter tuning to get something useful.

Also, one would think there should be another file reader for Lua if the conversion is suboptimal as you say?

Not to my knowledge. uv is even more low-level. (This snippet is what I use for nvim-treesitter.)

This comment has been minimized.

Copy link
@lervag

lervag Oct 26, 2023

Author Owner

Cool, thanks!

for lnum = 1, #lines do

This comment has been minimized.

Copy link
@clason

clason Oct 25, 2023

Contributor

Probably faster as a `for lnum, line in ipairs(lines)` loop?

This comment has been minimized.

Copy link
@lervag

lervag Oct 25, 2023

Author Owner

At least somewhat cleaner!

local line = lines[lnum]

if vim.tbl_isempty(item) then
item = parse_head(file, lnum, line)
else
item = parse_tail(item, line)
end

if item.parsed then
if item.type == "string" then
key, value = parse_string(item.body)
if key ~= "" then
strings[key] = value
end
else
table.insert(items, item)

This comment has been minimized.

Copy link
@clason

clason Oct 25, 2023

Contributor
items[#items+1] = item

(Faster if you know that items is an array (indexed by consecutive natural numbers) instead of a key-value dictionary.)

This comment has been minimized.

Copy link
@lervag

lervag Oct 25, 2023

Author Owner

Cool, but it does not seem to be very significant here.

This comment has been minimized.

Copy link
@clason

clason Oct 25, 2023

Contributor

No, and table.insert may be more readable -- just a general heads-up. (table.insert is optimized for single injections at arbitrary positions -- so it recreates the array every time; if you insert multiple entries in a row, the performance difference is more noticeable.)

end
item = {}
end
end

local result = {}
for _, x in ipairs(items) do
table.insert(result, parse_item(x, strings))
end
return result
end

return M
3 changes: 2 additions & 1 deletion test/test-completion-bibtex-speed/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
MYVIM ?= nvim --clean --headless

test:
@INMAKE=1 $(MYVIM) -u bibspeed.vim
@INMAKE=1 BACKEND=bibtex $(MYVIM) -u bibspeed.vim
@INMAKE=1 BACKEND=vim $(MYVIM) -u bibspeed.vim
@INMAKE=1 BACKEND=lua $(MYVIM) -u bibspeed.vim
@#INMAKE=1 BACKEND=bibparse $(MYVIM) -u bibspeed.vim
@#INMAKE=1 BACKEND=bibtexparser $(MYVIM) -u bibspeed.vim
@rm -f nvim_servernames.log
Loading

0 comments on commit de91b7b

Please sign in to comment.