size: 25 KiB

1local InlineParser = require("djot.inline").InlineParser
2local attributes = require("djot.attributes")
3local unpack = unpack or table.unpack
4local find, sub, byte = string.find, string.sub, string.byte
5
local Container = {}

--- Create a container whose behaviour is inherited from a block spec.
-- The spec (not `Container` itself) is installed as the metatable of the
-- new table and as its own `__index`, so fields and handlers defined on the
-- spec are reachable from the container.
-- @param spec table  the spec to inherit from; its `__index` is set here
-- @param data table|nil  optional fields copied onto the new container
-- @return table  the new container
function Container:new(spec, data)
  spec.__index = spec
  local instance = setmetatable({}, spec)
  for key, value in pairs(data or {}) do
    instance[key] = value
  end
  return instance
end
20
--- Classify a list marker into the list style(s) it could belong to.
-- @param marker string  the marker text, e.g. "-", "- [x]", "3.", "(iv)"
-- @return table  array of canonical style strings: one entry when the
--   style is unambiguous, two when a single letter could be either a roman
--   numeral or an alphabetic enumerator, and none when the marker matches
--   no known list style
local function get_list_styles(marker)
  local function matches(pattern)
    return string.find(marker, pattern) ~= nil
  end
  -- gsub returns (string, count); keep only the string
  local function canonical(class, replacement)
    local result = marker:gsub(class, replacement)
    return result
  end

  -- plain bullets and definition-list markers stand for themselves
  if marker == "+" or marker == "-" or marker == "*" or marker == ":" then
    return { marker }
  end
  -- task list
  if matches("^[+*-] %[[Xx ]%]") then
    return { "X" }
  end
  -- decimal enumerator
  if matches("^[(]?%d+[).]") then
    return { canonical("%d+", "1") }
  end
  -- in ambiguous cases (a single roman-numeral letter) return two values
  if matches("^[(]?[ivxlcdm][).]") then
    return { canonical("%a+", "i"), canonical("%a+", "a") }
  end
  if matches("^[(]?[IVXLCDM][).]") then
    return { canonical("%a+", "I"), canonical("%a+", "A") }
  end
  -- single letter that is not a roman numeral: alphabetic only
  if matches("^[(]?%l[).]") then
    return { canonical("%l", "a") }
  end
  if matches("^[(]?%u[).]") then
    return { canonical("%u", "A") }
  end
  -- multi-letter roman numerals
  if matches("^[(]?[ivxlcdm]+[).]") then
    return { canonical("%a+", "i") }
  end
  if matches("^[(]?[IVXLCDM]+[).]") then
    return { canonical("%a+", "I") }
  end
  -- doesn't match any list style
  return {}
end
45
---@class Parser
---@field subject string
---@field warn function
---@field matches table
---@field containers table
local Parser = {}

--- Create a block parser over `subject`.
-- @param subject string  the text to parse; a "\n" is appended when it
--   does not already end in "\r" or "\n"
-- @param warn function|nil  warning callback; defaults to a no-op
-- @return Parser  fresh parser state positioned at byte 1
function Parser:new(subject, warn)
  -- ensure the subject ends with a newline character
  local terminated = subject:find("[\r\n]$")
  if not terminated then
    subject = subject .. "\n"
  end

  local function ignore() end

  -- startline, starteol and endeol start out unset (nil)
  local state = {
    subject = subject,
    warn = warn or ignore,
    pos = 1,
    indent = 0,
    returned = 0,
    last_matched_container = 0,
    finished_line = false,
    matches = {},
    containers = {},
    timer = {},
  }

  self.__index = self
  return setmetatable(state, self)
end
76
--- Try to parse the current line as a pipe-table row or separator line.
-- On success the row's matches ("+row", cells or separators, "-row") are
-- appended to `self.matches`, `self.pos` is moved to the start of the line
-- ending and `self.finished_line` is set.  On failure the position and the
-- match list are rewound and nothing is consumed.
-- @param sp number  position of the row's opening `|`
-- @param ep number  position of the row's closing `|`
-- @return boolean  true if the line was a valid row, false otherwise
function Parser:parse_table_row(sp, ep)
  local orig_matches = #self.matches -- so we can rewind
  local startpos = self.pos
  self:add_match(sp, sp, "+row")
  -- skip | and any initial space in the cell:
  self.pos = find(self.subject, "%S", sp + 1)
  -- check to see if we have a separator line
  local seps = {}
  local p = self.pos
  local sepfound = false
  while not sepfound do
    -- one column of a separator line: optional ':' on either side of a
    -- run of '-', then the closing '|' with optional blanks around it
    local sepsp, sepep, left, right, trailing =
      find(self.subject, "^(%:?)%-%-*(%:?)([ \t]*%|[ \t]*)", p)
    if sepep then
      local st = "separator_default"
      if #left > 0 and #right > 0 then
        st = "separator_center"
      elseif #right > 0 then
        st = "separator_right"
      elseif #left > 0 then
        st = "separator_left"
      end
      -- the match covers the dashes/colons only, not the trailing pipe
      seps[#seps + 1] = {sepsp, sepep - #trailing, st}
      p = sepep + 1
      -- only a line made up entirely of separator columns counts
      if p == self.starteol then
        sepfound = true
        break
      end
    else
      break
    end
  end
  if sepfound then
    for i = 1, #seps do
      self:add_match(unpack(seps[i]))
    end
    self:add_match(self.starteol - 1, self.starteol - 1, "-row")
    self.pos = self.starteol
    self.finished_line = true
    return true
  end
  local inline_parser = InlineParser:new(self.subject, self.warn)
  self:add_match(sp, sp, "+cell")
  local complete_cell = false
  while self.pos <= ep do
    -- parse a chunk as inline content
    local nextbar, _
    while not nextbar do
      _, nextbar = self:find("^[^|\r\n]*|")
      if not nextbar then
        break
      end
      if find(self.subject, "^\\", nextbar - 1) then -- \|
        -- escaped pipe: it is cell content, keep scanning
        inline_parser:feed(self.pos, nextbar)
        self.pos = nextbar + 1
        nextbar = nil
      else
        inline_parser:feed(self.pos, nextbar - 1)
        if inline_parser:in_verbatim() then
          -- pipe inside a verbatim span: it is cell content, keep scanning
          inline_parser:feed(nextbar, nextbar)
          self.pos = nextbar + 1
          nextbar = nil
        else
          self.pos = nextbar + 1
        end
      end
    end
    complete_cell = nextbar
    if not complete_cell then
      break
    end
    -- add a table cell
    local cell_matches = inline_parser:get_matches()
    for i = 1, #cell_matches do
      local s, e, ann = unpack(cell_matches[i])
      if i == #cell_matches and ann == "str" then
        -- strip trailing space
        while byte(self.subject, e) == 32 and e >= s do
          e = e - 1
        end
      end
      self:add_match(s, e, ann)
    end
    self:add_match(nextbar, nextbar, "-cell")
    if nextbar < ep then
      -- reset inline parser state
      inline_parser = InlineParser:new(self.subject, self.warn)
      self:add_match(nextbar, nextbar, "+cell")
      self.pos = find(self.subject, "%S", self.pos)
    end
  end
  if not complete_cell then
    -- rewind, this is not a valid table row
    self.pos = startpos
    -- Remove only the matches added during this call.  Matches that were
    -- present on entry may already have been handed out by the events
    -- iterator (it tracks them by index in `self.returned`); deleting one
    -- of them would make the next match added land on an already-returned
    -- index and be silently skipped.
    local keep = orig_matches
    -- The table `open` handler adds "+table" immediately before calling
    -- this function and relies on the rewind to retract it when the first
    -- row turns out to be invalid, so drop that opener as well.
    local opener = self.matches[orig_matches]
    if opener and opener[3] == "+table" then
      keep = orig_matches - 1
    end
    for i = #self.matches, keep + 1, -1 do
      self.matches[i] = nil
    end
    return false
  else
    self:add_match(self.pos, self.pos, "-row")
    self.pos = self.starteol
    self.finished_line = true
    return true
  end
end
183
--- Build the ordered list of block-level container specs.
-- Each spec is a table with:
--   name      identifier of the block type
--   content   what the block holds: "inline", "block", "text", "cells",
--             "attributes", or nil for leaf blocks
--   open      function(spec): try to start this block at `self.pos`;
--             returns a true value when a container was opened
--   continue  function(container): returns a true value when the
--             current line continues an open container of this type
--   close     function(container): emit the closing match(es) and pop the
--             container from `self.containers`
-- The handlers are closures over this parser (`self`).  The order matters:
-- the paragraph spec must be first (callers index it as `[1]`), and the
-- events loop tries specs in order, taking the first one that opens.
-- @return table  array of spec tables
function Parser:specs()
  return {
    { name = "para",
      is_para = true,
      content = "inline",
      continue = function()
        -- a paragraph continues on any line that starts with non-space
        if self:find("^%S") then
          return true
        else
          return false
        end
      end,
      open = function(spec)
        self:add_container(Container:new(spec,
            { inline_parser =
                InlineParser:new(self.subject, self.warn) }))
        self:add_match(self.pos, self.pos, "+para")
        return true
      end,
      close = function()
        self:get_inline_matches()
        -- the closing match goes just after the last match emitted so far
        local last = self.matches[#self.matches] or {self.pos, self.pos, ""}
        local ep = last[2]
        self:add_match(ep + 1, ep + 1, "-para")
        self.containers[#self.containers] = nil
      end
    },

    { name = "caption",
      is_para = false,
      content = "inline",
      continue = function()
        return self:find("^%S")
      end,
      open = function(spec)
        local _, ep = self:find("^%^[ \t]+")
        if ep then
          self.pos = ep + 1
          self:add_container(Container:new(spec,
            { inline_parser =
                InlineParser:new(self.subject, self.warn) }))
          self:add_match(self.pos, self.pos, "+caption")
          return true
        end
      end,
      close = function()
        self:get_inline_matches()
        self:add_match(self.pos - 1, self.pos - 1, "-caption")
        self.containers[#self.containers] = nil
      end
    },

    { name = "blockquote",
      content = "block",
      continue = function()
        if self:find("^%>%s") then
          -- step over '>' only; the whitespace is left in place
          self.pos = self.pos + 1
          return true
        else
          return false
        end
      end,
      open = function(spec)
        if self:find("^%>%s") then
          self:add_container(Container:new(spec))
          self:add_match(self.pos, self.pos, "+blockquote")
          self.pos = self.pos + 1
          return true
        end
      end,
      close = function()
        self:add_match(self.pos, self.pos, "-blockquote")
        self.containers[#self.containers] = nil
      end
    },

    -- should go before reference definitions
    { name = "footnote",
      content = "block",
      continue = function(container)
        -- continues on lines indented past the marker, and on blank lines
        if self.indent > container.indent or self:find("^[\r\n]") then
          return true
        else
          return false
        end
      end,
      open = function(spec)
        local sp, ep, label = self:find("^%[%^([^]]+)%]:%s")
        if not sp then
          return nil
        end
        -- adding container will close others
        self:add_container(Container:new(spec, {note_label = label,
                                                indent = self.indent}))
        self:add_match(sp, sp, "+footnote")
        -- sp + 2 skips "[^"; ep - 3 stops before "]:" and the whitespace
        self:add_match(sp + 2, ep - 3, "note_label")
        self.pos = ep
        return true
      end,
      close = function(_container)
        self:add_match(self.pos, self.pos, "-footnote")
        self.containers[#self.containers] = nil
      end
    },

    -- should go before list_item_spec
    { name = "thematic_break",
      content = nil,
      continue = function()
        return false
      end,
      open = function(spec)
        local sp, ep = self:find("^[-*][ \t]*[-*][ \t]*[-*][-* \t]*[\r\n]")
        if ep then
          self:add_container(Container:new(spec))
          self:add_match(sp, ep, "thematic_break")
          self.pos = ep
          return true
        end
      end,
      close = function(_container)
        self.containers[#self.containers] = nil
      end
    },

    { name = "list_item",
      content = "block",
      continue = function(container)
        if self.indent > container.indent or self:find("^[\r\n]") then
          return true
        else
          return false
        end
      end,
      open = function(spec)
        -- try each marker shape in turn: bullet, decimal, roman, letter
        local sp, ep = self:find("^[-*+:]%s")
        if not sp then
          sp, ep = self:find("^%d+[.)]%s")
        end
        if not sp then
          sp, ep = self:find("^%(%d+%)%s")
        end
        if not sp then
          sp, ep = self:find("^[ivxlcdmIVXLCDM]+[.)]%s")
        end
        if not sp then
          sp, ep = self:find("^%([ivxlcdmIVXLCDM]+%)%s")
        end
        if not sp then
          sp, ep = self:find("^%a[.)]%s")
        end
        if not sp then
          sp, ep = self:find("^%(%a%)%s")
        end
        if not sp then
          return nil
        end
        local marker = sub(self.subject, sp, ep - 1)
        local checkbox = nil
        -- Parser:find always searches from self.pos (== sp here, since the
        -- marker patterns above are anchored), so no offset is passed.
        if self:find("^[*+-] %[[Xx ]%]%s") then -- task list
          marker = sub(self.subject, sp, sp + 4)
          checkbox = sub(self.subject, sp + 3, sp + 3)
        end
        -- some items have ambiguous style
        local styles = get_list_styles(marker)
        if #styles == 0 then
          return nil
        end
        local data = { styles = styles,
                       indent = self.indent }
        -- adding container will close others
        self:add_container(Container:new(spec, data))
        -- all candidate styles are encoded in the annotation, "|"-separated
        local annot = "+list_item"
        for i = 1, #styles do
          annot = annot .. "|" .. styles[i]
        end
        self:add_match(sp, ep - 1, annot)
        self.pos = ep
        if checkbox then
          if checkbox == " " then
            self:add_match(sp + 2, sp + 4, "checkbox_unchecked")
          else
            self:add_match(sp + 2, sp + 4, "checkbox_checked")
          end
          self.pos = sp + 5
        end
        return true
      end,
      close = function(_container)
        self:add_match(self.pos, self.pos, "-list_item")
        self.containers[#self.containers] = nil
      end
    },

    { name = "reference_definition",
      content = nil,
      continue = function(container)
        if container.indent >= self.indent then
          return false
        end
        -- an indented continuation line must be one unbroken run of
        -- non-space characters reaching the end of the line
        local _, ep, rest = self:find("^(%S+)")
        if ep and self.starteol == ep + 1 then
          self:add_match(ep - #rest + 1, ep, "reference_value")
          self.pos = ep + 1
          return true
        else
          return false
        end
      end,
      open = function(spec)
        local sp, ep, label, rest = self:find("^%[([^]\r\n]*)%]:[ \t]*(%S*)")
        if ep and self.starteol == ep + 1 then
          self:add_container(Container:new(spec,
             { key = label,
               indent = self.indent }))
          self:add_match(sp, sp, "+reference_definition")
          self:add_match(sp, sp + #label + 1, "reference_key")
          if #rest > 0 then
            self:add_match(ep - #rest + 1, ep, "reference_value")
          end
          self.pos = ep + 1
          return true
        end
      end,
      close = function(_container)
        self:add_match(self.pos, self.pos, "-reference_definition")
        self.containers[#self.containers] = nil
      end
    },

    { name = "heading",
      content = "inline",
      continue = function(container)
        -- a following line continues the heading only when it repeats
        -- the same number of '#' (the match includes one trailing space)
        local sp, ep = self:find("^%#+%s")
        if sp and ep and container.level == ep - sp then
          self.pos = ep
          return true
        else
          return false
        end
      end,
      open = function(spec)
        local sp, ep = self:find("^#+")
        if ep and find(self.subject, "^%s", ep + 1) then
          local level = ep - sp + 1
          self:add_container(Container:new(spec, {level = level,
               inline_parser = InlineParser:new(self.subject, self.warn) }))
          self:add_match(sp, ep, "+heading")
          self.pos = ep + 1
          return true
        end
      end,
      close = function(_container)
        self:get_inline_matches()
        local last = self.matches[#self.matches] or {self.pos, self.pos, ""}
        local ep = last[2]
        self:add_match(ep + 1, ep + 1, "-heading")
        self.containers[#self.containers] = nil
      end
    },

    { name = "code_block",
      content = "text",
      continue = function(container)
        -- a closing fence is at least as long as the opening one
        local char = sub(container.border, 1, 1)
        local sp, ep, border = self:find("^(" .. container.border ..
                                 char .. "*)[ \t]*[\r\n]")
        if ep then
          container.end_fence_sp = sp
          container.end_fence_ep = sp + #border - 1
          self.pos = ep -- before newline
          self.finished_line = true
          return false
        else
          return true
        end
      end,
      open = function(spec)
        local sp, ep, border, ws, lang =
          self:find("^(~~~~*)([ \t]*)(%S*)[ \t]*[\r\n]")
        if not ep then
          sp, ep, border, ws, lang =
            self:find("^(````*)([ \t]*)([^%s`]*)[ \t]*[\r\n]")
        end
        if border then
          -- an info string starting with "=" marks a raw block
          local is_raw = find(lang, "^=") and true or false
          self:add_container(Container:new(spec, {border = border,
                                                  indent = self.indent }))
          self:add_match(sp, sp + #border - 1, "+code_block")
          if #lang > 0 then
            local langstart = sp + #border + #ws
            if is_raw then
              self:add_match(langstart, langstart + #lang - 1, "raw_format")
            else
              self:add_match(langstart, langstart + #lang - 1, "code_language")
            end
          end
          self.pos = ep  -- before newline
          self.finished_line = true
          return true
        end
      end,
      close = function(container)
        local sp = container.end_fence_sp or self.pos
        local ep = container.end_fence_ep or self.pos
        self:add_match(sp, ep, "-code_block")
        if sp == ep then
          self.warn({ pos = self.pos, message = "Unclosed code block" })
        end
        self.containers[#self.containers] = nil
      end
    },

    { name = "fenced_div",
      content = "block",
      continue = function(container)
        if self.containers[#self.containers].name == "code_block" then
          return true -- see #109
        end
        local sp, ep, equals = self:find("^(::::*)[ \t]*[\r\n]")
        if ep and #equals >= container.equals then
          container.end_fence_sp = sp
          container.end_fence_ep = sp + #equals - 1
          self.pos = ep -- before newline
          return false
        else
          return true
        end
      end,
      open = function(spec)
        local sp, ep1, equals = self:find("^(::::*)[ \t]*")
        if not ep1 then
          return false
        end
        -- optional class name; the pattern can match empty, in which case
        -- ep < clsp and no "class" match is emitted
        local clsp, ep = find(self.subject, "^[%w_-]*", ep1 + 1)
        local _, eol = find(self.subject, "^[ \t]*[\r\n]", ep + 1)
        if eol then
          self:add_container(Container:new(spec, {equals = #equals}))
          self:add_match(sp, ep, "+div")
          if ep >= clsp then
            self:add_match(clsp, ep, "class")
          end
          self.pos = eol + 1
          self.finished_line = true
          return true
        end
      end,
      close = function(container)
        local sp = container.end_fence_sp or self.pos
        local ep = container.end_fence_ep or self.pos
        -- check to make sure the match is in order
        self:add_match(sp, ep, "-div")
        if sp == ep then
          self.warn({pos = self.pos, message = "Unclosed div"})
        end
        self.containers[#self.containers] = nil
      end
    },

    { name = "table",
      content = "cells",
      continue = function(_container)
        local sp, ep = self:find("^|[^\r\n]*|")
        local eolsp = ep and find(self.subject, "^[ \t]*[\r\n]", ep + 1);
        if eolsp then
          return self:parse_table_row(sp, ep)
        end
      end,
      open = function(spec)
        local sp, ep = self:find("^|[^\r\n]*|")
        -- make sure the closing pipe is at the end of the line (only
        -- blanks may follow it), exactly as `continue` requires
        local eolsp = ep and find(self.subject, "^[ \t]*[\r\n]", ep + 1)
        if sp and eolsp then
          self:add_container(Container:new(spec, { columns = 0 }))
          self:add_match(sp, sp, "+table")
          if self:parse_table_row(sp, ep) then
            return true
          else
            -- not a valid row after all: pop the container again; the
            -- "+table" match is retracted by parse_table_row's rewind
            self.containers[#self.containers] = nil
            return false
          end
        end
      end,
      close = function(_container)
        self:add_match(self.pos, self.pos, "-table")
        self.containers[#self.containers] = nil
      end
    },

    { name = "attributes",
      content = "attributes",
      open = function(spec)
        if self:find("^%{") then
          local attribute_parser =
                  attributes.AttributeParser:new(self.subject)
          local status, ep =
                 attribute_parser:feed(self.pos, self.endeol)
          -- reject on failure, or when the parser stopped before the
          -- end of the line
          if status == 'fail' or ep + 1 < self.endeol then
            return false
          else
            self:add_container(Container:new(spec,
                               { status = status,
                                 indent = self.indent,
                                 startpos = self.pos,
                                 slices = {},
                                 attribute_parser = attribute_parser }))
            local container = self.containers[#self.containers]
            container.slices = { {self.pos, self.endeol } }
            self.pos = self.starteol
            return true
          end

        end
      end,
      continue = function(container)
        if self.indent > container.indent then
          table.insert(container.slices, { self.pos, self.endeol })
          local status, ep =
            container.attribute_parser:feed(self.pos, self.endeol)
          container.status = status
          -- NOTE(review): `open` rejects with
          -- `status == 'fail' or ep + 1 < self.endeol`; here the second
          -- clause is joined with `or` to *accept*, so a failed parse can
          -- still continue -- confirm this is intended.
          if status ~= 'fail' or ep + 1 < self.endeol then
            self.pos = self.starteol
            return true
          end
        end
        -- if we get to here, we don't continue; either we
        -- reached the end of indentation or we failed in
        -- parsing attributes
        if container.status == 'done' then
          return false
        else -- attribute parsing failed; convert to para and continue
             -- with that
          local para_spec = self:specs()[1]
          local para = Container:new(para_spec,
                        { inline_parser =
                           InlineParser:new(self.subject, self.warn) })
          self:add_match(container.startpos, container.startpos, "+para")
          self.containers[#self.containers] = para
          -- reparse the text we couldn't parse as a block attribute:
          para.inline_parser.attribute_slices = container.slices
          para.inline_parser:reparse_attributes()
          self.pos = para.inline_parser.lastpos + 1
          return true
        end
      end,
      close = function(container)
        local attr_matches = container.attribute_parser:get_matches()
        self:add_match(container.startpos, container.startpos, "+block_attributes")
        for i = 1, #attr_matches do
          self:add_match(unpack(attr_matches[i]))
        end
        self:add_match(self.pos, self.pos, "-block_attributes")
        self.containers[#self.containers] = nil
      end
    }
  }
end
640
--- Append the matches collected by the innermost open container's
-- inline parser to this parser's match list.
function Parser:get_inline_matches()
  local tip = self.containers[#self.containers]
  local collected = tip.inline_parser:get_matches()
  local out = self.matches
  for idx = 1, #collected do
    out[#out + 1] = collected[idx]
  end
end
648
--- Search the subject for a Lua pattern, starting at the current position.
-- The search always begins at `self.pos`; any extra arguments passed by a
-- caller are ignored.
-- @param patt string  Lua pattern
-- @return the results of `string.find`: start and end positions followed
--   by any captures, or nil when there is no match
function Parser:find(patt)
  local subject, from = self.subject, self.pos
  return find(subject, patt, from)
end
652
--- Record a match at the end of the match list.
-- @param startpos number  first byte position covered by the match
-- @param endpos number  last byte position covered by the match
-- @param annotation string  the match's annotation
function Parser:add_match(startpos, endpos, annotation)
  local list = self.matches
  local entry = { startpos, endpos, annotation }
  list[#list + 1] = entry
end
656
--- Push a new container after closing the ones it cannot nest inside.
-- Containers are closed from the top of the stack while the stack is
-- deeper than the last container matched on the current line, or while
-- the top container's content is not "block".
-- @param container table  the container to push
function Parser:add_container(container)
  local stack = self.containers
  local keep = self.last_matched_container
  while true do
    local depth = #stack
    local tip = stack[depth]
    local must_close = depth > keep or
      (depth > 0 and tip.content ~= "block")
    if not must_close then
      break
    end
    tip:close() -- close handlers pop the container from the stack
  end
  stack[#stack + 1] = container
end
666
--- Advance past spaces and tabs and record the resulting indentation.
-- `self.indent` becomes the offset of the new position from
-- `self.startline`.  Nothing changes when no non-blank character is found.
function Parser:skip_space()
  local first_nonblank = find(self.subject, "[^ \t]", self.pos)
  if not first_nonblank then
    return
  end
  self.pos = first_nonblank
  self.indent = first_nonblank - self.startline
end
674
--- Locate the next line ending at or after the current position.
-- Sets `self.starteol` and `self.endeol` to the first and last byte of
-- the next "\n" or "\r\n".  When there is none, both are set to the
-- length of the subject.
function Parser:get_eol()
  local eol_first, eol_last = find(self.subject, "[\r]?[\n]", self.pos)
  if eol_last then
    self.starteol, self.endeol = eol_first, eol_last
  else
    local subject_end = #self.subject
    self.starteol, self.endeol = subject_end, subject_end
  end
end
683
-- Returns an iterator over events.  At each iteration, the iterator
-- returns three values: start byte position, end byte position,
-- and annotation.
--
-- The subject is processed one line at a time.  Matches produced while
-- handling a line are queued in self.matches and handed out one per call
-- (self.returned counts how many have been handed out); the next line is
-- only processed once the queue has been drained.  After the last line,
-- all open containers are closed and the remaining matches are returned;
-- the iterator then returns nothing.
function Parser:events()
  local specs = self:specs()
  local para_spec = specs[1]
  local subjectlen = #self.subject

  return function()  -- iterator

    while self.pos <= subjectlen do

      -- return any accumulated matches
      if self.returned < #self.matches then
        self.returned = self.returned + 1
        return unpack(self.matches[self.returned])
      end

      -- start of a new line: reset the per-line state
      self.indent = 0
      self.startline = self.pos
      self.finished_line = false
      self:get_eol()

      -- check open containers for continuation
      -- (outermost first; stop at the first one that does not continue)
      self.last_matched_container = 0
      local idx = 0
      while idx < #self.containers do
        idx = idx + 1
        local container = self.containers[idx]
        -- skip any indentation
        self:skip_space()
        if container:continue() then
          self.last_matched_container = idx
        else
          break
        end
      end

      -- if we hit a close fence, we can move to next line
      if self.finished_line then
        while #self.containers > self.last_matched_container do
          self.containers[#self.containers]:close()
        end
      end

      if not self.finished_line then
        -- check for new containers
        self:skip_space()
        local is_blank = (self.pos == self.starteol)

        local new_starts = false
        local last_match = self.containers[self.last_matched_container]
        -- new blocks can only start at the top level or inside a container
        -- that holds blocks; a line beginning with letters followed by
        -- whitespace is not checked against any block opener
        local check_starts = not is_blank and
                            (not last_match or last_match.content == "block") and
                              not self:find("^%a+%s") -- optimization
        while check_starts do
          check_starts = false
          for i=1,#specs do
            local spec = specs[i]
            if not spec.is_para then
              if spec:open() then
                self.last_matched_container = #self.containers
                if self.finished_line then
                  check_starts = false
                else
                  self:skip_space()
                  new_starts = true
                  -- a block container may have further blocks opening
                  -- on the same line, so scan the specs again
                  check_starts = spec.content == "block"
                end
                break
              end
            end
          end
        end

        if not self.finished_line then
          -- handle remaining content
          self:skip_space()

          is_blank = (self.pos == self.starteol)

          -- lazy continuation: a non-blank line that opened nothing new
          -- while an unmatched inline container is still innermost
          local is_lazy = not is_blank and
                          not new_starts and
                          self.last_matched_container < #self.containers and
                          self.containers[#self.containers].content == 'inline'

          local last_matched = self.last_matched_container
          if not is_lazy then
            -- close every container that was not continued on this line
            while #self.containers > 0 and #self.containers > last_matched do
              self.containers[#self.containers]:close()
            end
          end

          local tip = self.containers[#self.containers]

          -- add para by default if there's text
          if not tip or tip.content == 'block' then
            if is_blank then
              if not new_starts then
                -- need to track these for tight/loose lists
                self:add_match(self.pos, self.endeol, "blankline")
              end
            else
              para_spec:open()
            end
            tip = self.containers[#self.containers]
          end

          if tip then
            if tip.content == "text" then
              -- literal text container: emit the line as a "str" match
              local startpos = self.pos
              if tip.indent and self.indent > tip.indent then
                -- get back the leading spaces we gobbled
                startpos = startpos - (self.indent - tip.indent)
              end
              self:add_match(startpos, self.endeol, "str")
            elseif tip.content == "inline" then
              if not is_blank then
                tip.inline_parser:feed(self.pos, self.endeol)
              end
            end
          end
        end
      end

      -- move to the start of the next line
      self.pos = self.endeol + 1

    end

    -- close unmatched containers
    while #self.containers > 0 do
      self.containers[#self.containers]:close()
    end
    -- return any accumulated matches
    if self.returned < #self.matches then
      self.returned = self.returned + 1
      return unpack(self.matches[self.returned])
    end

  end

end
826
-- Module exports: the block parser class and the container class.
return { Parser = Parser,
         Container = Container }