-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtoc.py
More file actions
126 lines (105 loc) · 4.54 KB
/
toc.py
File metadata and controls
126 lines (105 loc) · 4.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
'''Package toc:
TOC stores various information about the table of contents.'''
import re
import utils
# Building blocks of the TOC line patterns.
# Annex key: one capital letter that ends at a word boundary.
ANNEXREGEX = r"[A-Z]\b"
# Chapter key: a plain number.
CHAPTERREGEX = r"\d+"
# Section key: a chapter number or an annex letter followed by one or more
# ".<number>" components, e.g. "6.7.3" or "H.12".
KEYREGEX = fr"(?:(?:{CHAPTERREGEX})|(?:{ANNEXREGEX}))(?:\.\d+)+"
# Page number: any run of digits and of the letters i, v and x.
PAGENUMREGEX = r"[0-9ivx]+"
# Title: at least three characters, starting with a word character, never
# ending with a space or a dot, and never containing two consecutive
# separators (space or dot).
TITLEREGEX = r"\w(?:[^ .][ .]?)+[^ .]"

# "<title> <page>"
UNKEYEDCHAPTERRE = re.compile(fr"({TITLEREGEX})\s+({PAGENUMREGEX})")
# "<chapter number> <title> <page>"
CHAPTERRE = re.compile(fr"({CHAPTERREGEX})\s+({TITLEREGEX})\s+({PAGENUMREGEX})")
# "<section key> <title> [. . . ] <page>", optionally indented
SECTIONRE = re.compile(fr"\s*({KEYREGEX})\s+({TITLEREGEX})\s+(?:\. )*\s*({PAGENUMREGEX})")


class TOC:
    '''A representation of the table of contents.

    Attributes:
    - titleline: str, the title line (the first line of the TOC pages)
    - titles: list of tuple (str, None or str), the association of title and
      key; some titles have no key

    Some titles are duplicate.'''

    def __init__(self, tocpages):
        '''Parameters:
        - tocpages: list of page.Page, the TOC pages; only their `content`
          attribute (an iterable of lines) is read

        Raises:
        - IndexError: if the pages contain no line at all

        Empty lines are skipped. Lines that match none of the known patterns
        are reported on standard output and otherwise ignored.'''
        # Flatten the pages in one pass; summing lists would copy the
        # accumulated result once per page.
        contents = [line for page in tocpages for line in page.content]
        self.titleline = contents[0]
        self.titles = []
        for line in contents[1:]:
            if not line:
                continue
            try:
                entry = self._parseline(line)
            except Exception:
                # Show the offending line before propagating the error.
                print(line)
                raise
            if entry is None:
                print("Unrecognized TOC pattern:", repr(line))
            else:
                self.titles.append(entry)

    @staticmethod
    def _parseline(line):
        '''_parseline(line): Parse one non-empty TOC line.

        Parameters:
        - line: str, the line to parse

        Return: tuple (str, None or str), the title and its key, or None if
        the line matches no known pattern'''
        # The keyed patterns must be tried first: the unkeyed pattern is
        # permissive enough to swallow a key of two or more digits into the
        # title ("10 Terms 25" would become the unkeyed title "10 Terms").
        if m := CHAPTERRE.fullmatch(line):
            key, title, _ = m.groups()
            return title, key
        if m := SECTIONRE.fullmatch(line):
            key, title, _ = m.groups()
            return title, key
        if m := UNKEYEDCHAPTERRE.fullmatch(line):
            title, _ = m.groups()
            if title.startswith("Annex "):
                # The annex letter is the character right after "Annex ".
                return title, title[6]
            return title, None
        return None
class TOCMatcher:
    '''A tool to match TOC entries and headings in the contents

    This object helps identify the headings in the content, whether because they
    are referenced in the TOC, or because they are a subheading of the last
    matched entry from the TOC.

    TOC entries are kept on a stack and consumed in TOC order: only the next
    expected entry can be matched at any time.'''

    # Line prefixes that look like a sub-heading but are only an unfortunate
    # cross-reference at the start of a line.
    _FALSEHEADINGS = (
        '6.7.3.2 through 6.7.3.6',  # N3220 6.7.3.1p5
        'H.12.5 (see H.8).',        # N3220 H.12.1p4
    )

    def __init__(self, toc):
        '''Parameters:
        - toc: TOC, the table of contents; only its `titles` attribute is
          read'''
        def maketitleregex(title, key):
            # Titles and keys are literal text: escape them so that characters
            # such as "(", "+" or "." are not read as regex syntax.
            if title[:6] == "Annex ":
                # Annex headings are matched on "Annex X" alone, preceded by
                # at least one whitespace character.
                annex = re.escape(title[:7])
                return re.compile(fr"^\s+{annex}$")
            literaltitle = re.escape(title)
            if key is None:
                return re.compile(fr"^\s*{literaltitle}$")
            literalkey = re.escape(key)
            if key.count('.') == 0:
                # Chapter headings have a dot after the number: "1. Scope"
                return re.compile(fr"^\s*{literalkey}\.\s+{literaltitle}$")
            return re.compile(fr"^\s*{literalkey}\s+{literaltitle}$")

        def makeheadingregex(key):
            # Unkeyed entries cannot have numbered sub-headings.
            if key is None:
                return None
            literalkey = re.escape(key)
            return re.compile(fr"^\s*{literalkey}(\.\d+)+($|\s)")

        # Both stacks are reversed so that the next expected entry is on top.
        self._titlestack = [maketitleregex(t, k)
                            for t, k in reversed(toc.titles)]
        self._headingstack = [makeheadingregex(k)
                              for _, k in reversed(toc.titles)]
        # At the beginning, before any title has been matched, we shall not
        # match against any heading
        self._headingstack += [None]

    def matchtitle(self, line):
        '''matchtitle(self, line): Match a line against the next TOC entry.

        Parameters:
        - line: str, the line to match

        Return: bool, True if the line matched

        ⚠️ This function modifies the object. Consequent calls with similar
        inputs may not give the same output.⚠️

        Only the top entry is matched. If the match is successful, the entry is
        removed from the stack.'''
        if self._titlestack and self._titlestack[-1].fullmatch(line):
            self._titlestack.pop()
            # Keep the heading stack in step: its top becomes the sub-heading
            # pattern of the entry that has just been matched.
            self._headingstack.pop()
            return True
        return False

    def matchheading(self, line):
        '''matchheading(self, line): Match a line under the latest TOC entry.

        Parameters:
        - line: str, the line to match

        Return: bool, True if the line matched

        Only the top entry is matched. The object is not modified.'''
        if not self._headingstack:
            return False
        pattern = self._headingstack[-1]
        if pattern is None or not pattern.match(line):
            return False
        # Known false positives: not titles, just unfortunate references.
        return not line.startswith(self._FALSEHEADINGS)