size: 22 KiB

1-- this allows the code to work with both lua and luajit:
2local unpack = unpack or table.unpack
3local attributes = require("djot.attributes")
4local find, byte = string.find, string.byte
5
--- `string.find` restricted to matches that end at or before `endpos`.
-- At most three captures are forwarded to the caller.
-- @param subj string to search
-- @param patt Lua pattern
-- @param startpos index at which the search begins
-- @param endpos last index the match is allowed to reach
-- @return start, end and up to three captures; nothing at all when there
--   is no match or the match runs past `endpos`
local function bounded_find(subj, patt, startpos, endpos)
  local first, last, cap1, cap2, cap3 = string.find(subj, patt, startpos)
  if not last or last > endpos then
    return
  end
  return first, last, cap1, cap2, cap3
end
13
14-- General note on the parsing strategy: our objective is to
15-- parse without backtracking. To that end, we keep a stack of
16-- potential 'openers' for links, images, emphasis, and other
17-- inline containers. When we parse a potential closer for
18-- one of these constructions, we can scan the stack of openers
19-- for a match, which will tell us the location of the potential
20-- opener. We can then change the annotation of the match at
21-- that location to '+emphasis' or whatever.
22
local InlineParser = {}

--- Create a parser for the inline content of `subject`.
-- @param subject the text to parse
-- @param warn optional function used to report warnings; defaults to a
--   function that does nothing
-- @return a new parser state whose metatable is the receiver
function InlineParser:new(subject, warn)
  -- instances look up their methods on the receiver
  self.__index = self
  return setmetatable({
    warn = warn or function() end, -- function to issue warnings
    subject = subject,             -- text to parse
    matches = {},                  -- startpos -> {startpos, endpos, annotation}
    openers = {},                  -- map from closer_type to array of (pos, data) in reverse order
    verbatim = 0,                  -- parsing verbatim span to be ended by n backticks
    verbatim_type = nil,           -- whether verbatim is math or regular
    destination = false,           -- parsing link destination in ()
    firstpos = 0,                  -- position of first slice
    lastpos = 0,                   -- position of last slice
    allow_attributes = true,       -- allow parsing of attributes
    attribute_parser = nil,        -- attribute parser
    attribute_start = nil,         -- start of potential attribute
    attribute_slices = nil,        -- slices we've tried to parse as attributes
  }, self)
end
45
--- Record a match, keyed by its start position.
-- A match already stored at `startpos` is replaced.
function InlineParser:add_match(startpos, endpos, annotation)
  local entry = {startpos, endpos, annotation}
  self.matches[startpos] = entry
end
49
--- Push a potential opener onto the stack kept for `name`.
-- The extra arguments are stored together as one array. By position:
--   1 = startpos, 2 = endpos, 3 = annotation, 4 = substartpos, 5 = subendpos
--
--   [link text](url)
--   ^         ^^
--   1,2       4 5      3 = "explicit_link"
function InlineParser:add_opener(name, ...)
  local stack = self.openers[name]
  if not stack then
    stack = {}
    self.openers[name] = stack
  end
  stack[#stack + 1] = {...}
end
62
--- Discard openers lying between `startpos` and `endpos`.
-- Every opener stack is walked from its top downwards. An opener whose
-- own span lies inside the range is removed; one whose secondary span
-- (fields 4 and 5) lies inside the range stays on the stack but loses
-- fields 3-5. The walk of a stack ends at the first opener that meets
-- neither condition.
function InlineParser:clear_openers(startpos, endpos)
  for _, stack in pairs(self.openers) do
    local idx = #stack
    while stack[idx] do
      local opener = stack[idx]
      local sp, ep = opener[1], opener[2]
      local sub_sp, sub_ep = opener[4], opener[5]
      if sp >= startpos and ep <= endpos then
        stack[idx] = nil
      elseif sub_sp and sub_sp >= startpos and sub_ep and sub_ep <= endpos then
        opener[3] = nil
        opener[4] = nil
        opener[5] = nil
      else
        break
      end
      idx = idx - 1
    end
  end
end
82
--- Turn every match that starts in [startpos, endpos] into a plain "str".
-- Matches already annotated "str" or "escape" are left as they are.
function InlineParser:str_matches(startpos, endpos)
  local matches = self.matches
  for i = startpos, endpos do
    local m = matches[i]
    if m and m[3] ~= "str" and m[3] ~= "escape" then
      matches[i] = {m[1], m[2], "str"}
    end
  end
end
94
--- Test a match's annotation (its third field) against a Lua pattern.
-- @param match a match triple, or nil
-- @param patt Lua pattern
-- @return whatever `string.find` returns, or nothing when `match` is
--   nil/false
local function matches_pattern(match, patt)
  if not match then
    return
  end
  return string.find(match[3], patt)
end
100
101
--- Build a matcher for a delimiter character that brackets inline content
-- (emphasis, quotes, insert/delete, ...).
-- @param c the delimiter character, as a string
-- @param annotation base annotation; a matched pair is recorded as
--   "+annotation" (opener) and "-annotation" (closer)
-- @param defaultmatch annotation used when the delimiter does not pair up
--   (defaults to "str")
-- @param opentest optional function(self, pos) giving an extra condition
--   that must hold for the delimiter to act as an opener
-- @return a matcher function(self, pos, endpos) returning the position at
--   which parsing should resume
function InlineParser.between_matched(c, annotation, defaultmatch, opentest)
  local base_default = defaultmatch or "str"
  return function(self, pos, endpos)
    -- Work on a per-call copy: the left/right flip below must not leak
    -- into later invocations of this (shared) matcher.
    local default = base_default
    local subject = self.subject
    local can_open = find(subject, "^%S", pos + 1)
    local can_close = find(subject, "^%S", pos - 1)
    local has_open_marker =
      matches_pattern(self.matches[pos - 1], "^open%_marker")
    local has_close_marker = pos + 1 <= endpos and
                             byte(subject, pos + 1) == 125 -- }
    local endcloser = pos
    local startopener = pos

    if type(opentest) == "function" then
      can_open = can_open and opentest(self, pos)
    end

    -- allow explicit open/close markers to override:
    if has_open_marker then
      can_open = true
      can_close = false
      startopener = pos - 1
    end
    if not has_open_marker and has_close_marker then
      can_close = true
      can_open = false
      endcloser = pos + 1
    end

    -- an explicit marker also decides which way a lone quote curls
    if has_open_marker and default:match("^right") then
      default = (default:gsub("^right", "left"))
    elseif has_close_marker and default:match("^left") then
      default = (default:gsub("^left", "right"))
    end

    -- openers written with an explicit "{" live on their own stack
    local d
    if has_close_marker then
      d = "{" .. c
    else
      d = c
    end
    local openers = self.openers[d]
    if can_close and openers and #openers > 0 then
      -- check openers for a match
      local openpos, openposend = unpack(openers[#openers])
      if openposend ~= pos - 1 then -- exclude empty emph
        self:clear_openers(openpos, pos)
        self:add_match(openpos, openposend, "+" .. annotation)
        self:add_match(pos, endcloser, "-" .. annotation)
        return endcloser + 1
      end
    end

    -- if we get here, we didn't match an opener
    if can_open then
      if has_open_marker then
        d = "{" .. c
      else
        d = c
      end
      self:add_opener(d, startopener, pos)
      self:add_match(startopener, pos, default)
      return pos + 1
    else
      self:add_match(pos, endcloser, default)
      return endcloser + 1
    end
  end
end
170
-- Dispatch table from the byte value of a special character to its
-- matcher. A matcher is called as matcher(self, pos, endpos) with `pos`
-- at the special character; it records matches and returns the position
-- at which to resume. When it returns nil, `feed` falls back to
-- `single_char`.
InlineParser.matchers = {
  -- 96 = `
  -- Opens a verbatim span, or inline/display math when the backticks are
  -- directly preceded by `$` / `$$`.
  [96] = function(self, pos, endpos)
    local subject = self.subject
    local _, endchar = bounded_find(subject, "^`*", pos, endpos)
    if not endchar then
      return nil
    end
    if find(subject, "^%$%$", pos - 2) and
       not find(subject, "^\\", pos - 3) then
      -- the two `$` become part of the opener
      self.matches[pos - 2] = nil
      self.matches[pos - 1] = nil
      self:add_match(pos - 2, endchar, "+display_math")
      self.verbatim_type = "display_math"
    elseif find(subject, "^%$", pos - 1) then
      self.matches[pos - 1] = nil
      self:add_match(pos - 1, endchar, "+inline_math")
      self.verbatim_type = "inline_math"
    else
      self:add_match(pos, endchar, "+verbatim")
      self.verbatim_type = "verbatim"
    end
    -- remember how many backticks must close the span
    self.verbatim = endchar - pos + 1
    return endchar + 1
  end,

  -- 92 = \
  -- Backslash before (optional blanks and) a newline is a hard break;
  -- before punctuation or a space it is an escape; otherwise it is a
  -- literal backslash.
  [92] = function(self, pos, endpos)
    local subject = self.subject
    local _, endchar = bounded_find(subject, "^[ \t]*\r?\n", pos + 1, endpos)
    if endchar then
      self:add_match(pos, pos, "escape")
      -- Strip blanks that directly precede the backslash. `self.matches`
      -- is keyed by start position (it is not a sequence), so locate the
      -- previous match by walking back to the nearest recorded start.
      local i = pos - 1
      while i >= 1 and not self.matches[i] do
        i = i - 1
      end
      local prev = self.matches[i]
      if prev and prev[3] == "str" and prev[2] == pos - 1 then
        local sp, ep = prev[1], prev[2]
        while ep >= sp and
              (byte(subject, ep) == 32 or byte(subject, ep) == 9) do
          ep = ep - 1
        end
        if ep < sp then
          -- the preceding str was nothing but blanks
          self.matches[sp] = nil
        else
          self:add_match(sp, ep, "str")
        end
      end
      self:add_match(pos + 1, endchar, "hardbreak")
      return endchar + 1
    end

    local _, ec = bounded_find(subject, "^[%p ]", pos + 1, endpos)
    if not ec then
      -- nothing escapable follows: literal backslash
      self:add_match(pos, pos, "str")
      return pos + 1
    end
    self:add_match(pos, pos, "escape")
    if find(subject, "^ ", pos + 1) then
      self:add_match(pos + 1, ec, "nbsp")
    else
      self:add_match(pos + 1, ec, "str")
    end
    return ec + 1
  end,

  -- 60 = <
  -- Autolinks: <scheme:...> is a url, <...@...> an email. Anything else
  -- yields nil.
  [60] = function(self, pos, endpos)
    local subject = self.subject
    local starturl, endurl =
      bounded_find(subject, "^%<[^<>%s]+%>", pos, endpos)
    if starturl then
      local is_url = bounded_find(subject, "^%a+:", pos + 1, endurl)
      local is_email = bounded_find(subject, "^[^:]+%@", pos + 1, endurl)
      if is_email then
        self:add_match(starturl, starturl, "+email")
        self:add_match(starturl + 1, endurl - 1, "str")
        self:add_match(endurl, endurl, "-email")
        return endurl + 1
      elseif is_url then
        self:add_match(starturl, starturl, "+url")
        self:add_match(starturl + 1, endurl - 1, "str")
        self:add_match(endurl, endurl, "-url")
        return endurl + 1
      end
    end
  end,

  -- 126 = ~
  [126] = InlineParser.between_matched('~', 'subscript'),

  -- 94 = ^
  [94] = InlineParser.between_matched('^', 'superscript'),

  -- 91 = [
  -- Either a complete footnote reference `[^label]`, or a potential
  -- opener for a link, image or span.
  [91] = function(self, pos, endpos)
    local sp, ep = bounded_find(self.subject, "^%^([^]]+)%]", pos + 1, endpos)
    if sp then -- footnote ref
      self:add_match(pos, ep, "footnote_reference")
      return ep + 1
    else
      self:add_opener("[", pos, pos)
      self:add_match(pos, pos, "str")
      return pos + 1
    end
  end,

  -- 93 = ]
  -- Looks at the most recent `[` opener. If it is already tagged as a
  -- reference link, this `]` completes the reference. Otherwise the next
  -- character decides: `[` starts a reference, `(` starts a destination,
  -- `{` makes a bracketed span. Any other case yields nil.
  [93] = function(self, pos, endpos)
    local openers = self.openers["["]
    local subject = self.subject
    if openers and #openers > 0 then
      local opener = openers[#openers]
      if opener[3] == "reference_link" then
        -- found a reference link
        -- add the matches
        local is_image = bounded_find(subject, "^!", opener[1] - 1, endpos)
                and not bounded_find(subject, "^[\\]", opener[1] - 2, endpos)
        if is_image then
          self:add_match(opener[1] - 1, opener[1] - 1, "image_marker")
          self:add_match(opener[1], opener[2], "+imagetext")
          self:add_match(opener[4], opener[4], "-imagetext")
        else
          self:add_match(opener[1], opener[2], "+linktext")
          self:add_match(opener[4], opener[4], "-linktext")
        end
        self:add_match(opener[5], opener[5], "+reference")
        self:add_match(pos, pos, "-reference")
        -- convert all matches to str
        self:str_matches(opener[5] + 1, pos - 1)
        -- remove from openers
        self:clear_openers(opener[1], pos)
        return pos + 1
      elseif bounded_find(subject, "^%[", pos + 1, endpos) then
        opener[3] = "reference_link"
        opener[4] = pos      -- intermediate ]
        opener[5] = pos + 1  -- intermediate [
        self:add_match(pos, pos + 1, "str")
        -- remove any openers between [ and ]
        self:clear_openers(opener[1] + 1, pos - 1)
        return pos + 2
      elseif bounded_find(subject, "^%(", pos + 1, endpos) then
        self.openers["("] = {} -- clear ( openers
        opener[3] = "explicit_link"
        opener[4] = pos      -- intermediate ]
        opener[5] = pos + 1  -- intermediate (
        self.destination = true
        self:add_match(pos, pos + 1, "str")
        -- remove any openers between [ and ]
        self:clear_openers(opener[1] + 1, pos - 1)
        return pos + 2
      elseif bounded_find(subject, "^%{", pos + 1, endpos) then
        -- assume this is attributes, bracketed span
        self:add_match(opener[1], opener[2], "+span")
        self:add_match(pos, pos, "-span")
        -- remove any openers between [ and ]
        self:clear_openers(opener[1], pos)
        return pos + 1
      end
    end
  end,


  -- 40 = (
  -- Only tracked while inside a link destination, so that nested
  -- parentheses can be balanced.
  [40] = function(self, pos)
    if not self.destination then return nil end
    self:add_opener("(", pos, pos)
    self:add_match(pos, pos, "str")
    return pos + 1
  end,

  -- 41 = )
  -- Inside a destination: closes a nested `(` if one is open, otherwise
  -- completes the explicit link or image.
  [41] = function(self, pos, endpos)
    if not self.destination then return nil end
    local parens = self.openers["("]
    if parens and #parens > 0 and parens[#parens][1] then
      parens[#parens] = nil -- clear opener
      self:add_match(pos, pos, "str")
      return pos + 1
    else
      local subject = self.subject
      local openers = self.openers["["]
      if openers and #openers > 0
          and openers[#openers][3] == "explicit_link" then
        local opener = openers[#openers]
        -- we have inline link
        local is_image = bounded_find(subject, "^!", opener[1] - 1, endpos)
               and not bounded_find(subject, "^[\\]", opener[1] - 2, endpos)
        if is_image then
          self:add_match(opener[1] - 1, opener[1] - 1, "image_marker")
          self:add_match(opener[1], opener[2], "+imagetext")
          self:add_match(opener[4], opener[4], "-imagetext")
        else
          self:add_match(opener[1], opener[2], "+linktext")
          self:add_match(opener[4], opener[4], "-linktext")
        end
        self:add_match(opener[5], opener[5], "+destination")
        self:add_match(pos, pos, "-destination")
        self.destination = false
        -- convert all matches to str
        self:str_matches(opener[5] + 1, pos - 1)
        -- remove from openers
        self:clear_openers(opener[1], pos)
        return pos + 1
      end
    end
  end,

  -- 95 = _
  [95] = InlineParser.between_matched('_', 'emph'),

  -- 42 = *
  [42] = InlineParser.between_matched('*', 'strong'),

  -- 123 = {
  -- Before a delimiter character `{` is an explicit open marker.
  -- Otherwise it starts an attribute parse when attributes are allowed,
  -- and is plain text when they are not.
  [123] = function(self, pos, endpos)
    if bounded_find(self.subject, "^[_*~^+='\"-]", pos + 1, endpos) then
      self:add_match(pos, pos, "open_marker")
      return pos + 1
    elseif self.allow_attributes then
      self.attribute_parser = attributes.AttributeParser:new(self.subject)
      self.attribute_start = pos
      self.attribute_slices = {}
      -- return pos unchanged: the `{` itself is fed to the new parser
      return pos
    else
      self:add_match(pos, pos, "str")
      return pos + 1
    end
  end,

  -- 58 = :
  -- `:name:` is a symbol; a lone colon is plain text.
  [58] = function(self, pos, endpos)
    local sp, ep = bounded_find(self.subject, "^%:[%w_+-]+%:", pos, endpos)
    if sp then
      self:add_match(sp, ep, "symbol")
      return ep + 1
    else
      self:add_match(pos, pos, "str")
      return pos + 1
    end
  end,

  -- 43 = +
  -- May open only next to an explicit brace: `{+` or `+}`.
  [43] = InlineParser.between_matched("+", "insert", "str",
                         function(self, pos)
                           return find(self.subject, "^%{", pos - 1) or
                                  find(self.subject, "^%}", pos + 1)
                         end),

  -- 61 = =
  -- May open only next to an explicit brace: `{=` or `=}`.
  [61] = InlineParser.between_matched("=", "mark", "str",
                         function(self, pos)
                           return find(self.subject, "^%{", pos - 1) or
                                  find(self.subject, "^%}", pos + 1)
                         end),

  -- 39 = '
  -- NOTE(review): inside the character class below, `'-(` is a range
  -- (it covers `'` and `(`), so a literal hyphen is NOT accepted as a
  -- preceding character. Confirm whether that is intended.
  [39] = InlineParser.between_matched("'", "single_quoted", "right_single_quote",
                         function(self, pos) -- test to open
                           return pos == 1 or
                             find(self.subject, "^[%s\"'-([]", pos - 1)
                         end),

  -- 34 = "
  [34] = InlineParser.between_matched('"', "double_quoted", "left_double_quote"),

  -- 45 = -
  -- Next to an explicit brace (`{-` or `-}`) the hyphen is a delete
  -- delimiter. Otherwise a run of hyphens is split into em dashes,
  -- en dashes and plain hyphens.
  [45] = function(self, pos, endpos)
    local subject = self.subject
    local nextpos
    if byte(subject, pos - 1) == 123 or
       byte(subject, pos + 1) == 125 then -- (123 = { 125 = })
      nextpos = InlineParser.between_matched("-", "delete", "str",
                         function(slf, p)
                           return find(slf.subject, "^%{", p - 1) or
                                  find(slf.subject, "^%}", p + 1)
                         end)(self, pos, endpos)
      return nextpos
    end
    -- didn't match a del, try for smart hyphens:
    local _, ep = find(subject, "^%-*", pos)
    if endpos < ep then
      ep = endpos
    end
    local hyphens = 1 + ep - pos
    if byte(subject, ep + 1) == 125 then -- 125 = }
      hyphens = hyphens - 1 -- last hyphen is close del
    end
    if hyphens == 0 then  -- this means we have '-}'
      self:add_match(pos, pos + 1, "str")
      return pos + 2
    end
    -- Try to construct a homogeneous sequence of dashes
    local all_em = hyphens % 3 == 0
    local all_en = hyphens % 2 == 0
    while hyphens > 0 do
      if all_em then
        self:add_match(pos, pos + 2, "em_dash")
        pos = pos + 3
        hyphens = hyphens - 3
      elseif all_en then
        self:add_match(pos, pos + 1, "en_dash")
        pos = pos + 2
        hyphens = hyphens - 2
      elseif hyphens >= 3 and (hyphens % 2 ~= 0 or hyphens > 4) then
        self:add_match(pos, pos + 2, "em_dash")
        pos = pos + 3
        hyphens = hyphens - 3
      elseif hyphens >= 2 then
        self:add_match(pos, pos + 1, "en_dash")
        pos = pos + 2
        hyphens = hyphens - 2
      else
        self:add_match(pos, pos, "str")
        pos = pos + 1
        hyphens = hyphens - 1
      end
    end
    return pos
  end,

  -- 46 = .
  -- Three dots in a row become an ellipsis; anything else yields nil.
  [46] = function(self, pos, endpos)
    if bounded_find(self.subject, "^%.%.", pos + 1, endpos) then
      self:add_match(pos, pos + 2, "ellipses")
      return pos + 3
    end
  end
}
500
--- Fallback matcher: record the byte at `pos` as a one-character "str".
-- @return the position just after `pos`
function InlineParser:single_char(pos)
  local nextpos = pos + 1
  self:add_match(pos, pos, "str")
  return nextpos
end
505
--- Re-feed, as ordinary inline content, the slices that were consumed
-- while an attribute parse was being attempted. Attribute parsing is
-- switched off while they are re-fed and switched back on afterwards.
-- Does nothing when no slices are recorded.
function InlineParser:reparse_attributes()
  local pending = self.attribute_slices
  if not pending then
    return
  end
  self.attribute_parser = nil
  self.attribute_start = nil
  self.allow_attributes = false
  for i = 1, #pending do
    local slice = pending[i]
    self:feed(slice[1], slice[2])
  end
  self.attribute_slices = nil
  self.allow_attributes = true
end
523
--- Feed a slice of the subject to the parser, updating its state.
-- The slice is scanned left to right. Runs of ordinary characters become
-- "str" matches; each special character is handed to the active attribute
-- parser, to the verbatim handling, or to its entry in `self.matchers`.
-- @param spos first position of the slice
-- @param endpos last position of the slice (inclusive)
function InlineParser:feed(spos, endpos)
  local special = "[][\\`{}_*()!<>~^:=+$\r\n'\".-]"
  local subject = self.subject
  local matchers = self.matchers
  -- widen the overall range covered by the slices seen so far
  if self.firstpos == 0 or spos < self.firstpos then
    self.firstpos = spos
  end
  if self.lastpos == 0 or endpos > self.lastpos then
    self.lastpos = endpos
  end
  local pos = spos
  while pos <= endpos do
    if self.attribute_parser then
      -- hand the attribute parser everything up to and including the
      -- next special character (or the rest of the slice)
      local sp = pos
      local ep2 = bounded_find(subject, special, pos, endpos) or endpos
      local status, ep = self.attribute_parser:feed(sp, ep2)
      if status == "done" then
        local attribute_start = self.attribute_start
        -- add attribute matches
        self:add_match(attribute_start, attribute_start, "+attributes")
        self:add_match(ep, ep, "-attributes")
        local attr_matches = self.attribute_parser:get_matches()
        for i = 1, #attr_matches do
          self:add_match(unpack(attr_matches[i]))
        end
        -- restore state to prior to adding attribute parser:
        self.attribute_parser = nil
        self.attribute_start = nil
        self.attribute_slices = nil
        pos = ep + 1
      elseif status == "fail" then
        self:reparse_attributes()
        pos = sp  -- we'll want to go over the whole failed portion again,
                  -- as no slice was added for it
      elseif status == "continue" then
        self.attribute_slices[#self.attribute_slices + 1] = {sp, ep}
        pos = ep + 1
      end
    else
      -- find next interesting character:
      local newpos = bounded_find(subject, special, pos, endpos) or endpos + 1
      if newpos > pos then
        self:add_match(pos, newpos - 1, "str")
        pos = newpos
        if pos > endpos then
          break -- otherwise, fall through:
        end
      end
      -- if we get here, then newpos = pos,
      -- i.e. we have something interesting at pos
      local c = byte(subject, pos)

      if c == 13 or c == 10 then -- cr or lf
        -- CR immediately followed by LF is a single break. The pattern
        -- must be a literal newline: "%n" would match the letter "n".
        if c == 13 and bounded_find(subject, "^\n", pos + 1, endpos) then
          self:add_match(pos, pos + 1, "softbreak")
          pos = pos + 2
        else
          self:add_match(pos, pos, "softbreak")
          pos = pos + 1
        end
      elseif self.verbatim > 0 then
        -- inside verbatim only a backtick run of the opening length matters
        if c == 96 then
          local _, endchar = bounded_find(subject, "^`+", pos, endpos)
          if endchar and endchar - pos + 1 == self.verbatim then
            -- check for raw attribute
            local sp, ep =
              bounded_find(subject, "^%{%=[^%s{}`]+%}", endchar + 1, endpos)
            if sp and self.verbatim_type == "verbatim" then -- raw
              self:add_match(pos, endchar, "-" .. self.verbatim_type)
              self:add_match(sp, ep, "raw_format")
              pos = ep + 1
            else
              self:add_match(pos, endchar, "-" .. self.verbatim_type)
              pos = endchar + 1
            end
            self.verbatim = 0
            self.verbatim_type = nil
          else
            -- a backtick run of the wrong length is verbatim content
            endchar = endchar or endpos
            self:add_match(pos, endchar, "str")
            pos = endchar + 1
          end
        else
          self:add_match(pos, pos, "str")
          pos = pos + 1
        end
      else
        local matcher = matchers[c]
        pos = (matcher and matcher(self, pos, endpos)) or self:single_char(pos)
      end
    end
  end
end
626
--- Report whether the parser is currently inside verbatim content,
-- i.e. `self.verbatim` still holds a pending closing-backtick count.
function InlineParser:in_verbatim()
  local pending_ticks = self.verbatim
  return pending_ticks > 0
end
631
--- Return the recorded matches as an array ordered by start position.
-- An attribute parse still in progress is first re-fed as ordinary
-- content. Directly adjacent "str" matches are merged, a final softbreak
-- is dropped, and trailing spaces are trimmed from a final "str". If a
-- verbatim span is still open, a warning is issued and a closing match is
-- appended.
function InlineParser:get_matches()
  if self.attribute_parser then -- we're still in an attribute parse
    self:reparse_attributes()
  end

  local subject = self.subject
  local matches = self.matches
  local result = {}
  local prev_sp, prev_ep, prev_annot
  for i = self.firstpos, self.lastpos do
    local m = matches[i]
    if m then
      local sp, ep, annot = m[1], m[2], m[3]
      if annot == "str" and prev_annot == "str" and prev_ep + 1 == sp then
        -- consolidate adjacent strs
        result[#result] = {prev_sp, ep, annot}
        prev_ep = ep
      else
        result[#result + 1] = m
        prev_sp, prev_ep, prev_annot = sp, ep, annot
      end
    end
  end

  local last = result[#result]
  if not last then
    return result
  end
  local startpos, endpos, annot = last[1], last[2], last[3]

  -- remove final softbreak
  if annot == "softbreak" then
    result[#result] = nil
    last = result[#result]
    if not last then
      return result
    end
    startpos, endpos, annot = last[1], last[2], last[3]
  end

  -- remove trailing spaces
  if annot == "str" and byte(subject, endpos) == 32 then
    while endpos > startpos and byte(subject, endpos) == 32 do
      endpos = endpos - 1
    end
    result[#result] = {startpos, endpos, annot}
  end

  if self.verbatim > 0 then -- unclosed verbatim
    self.warn({ message = "Unclosed verbatim", pos = endpos })
    result[#result + 1] = {endpos, endpos, "-" .. self.verbatim_type}
  end
  return result
end
678
679return { InlineParser = InlineParser }