Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0
3# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
4
5"""
6Regular expression ancillary classes.
7
8Those help caching regular expressions and do matching for kernel-doc.
9"""
10
11import re
12
# Local cache for compiled regular expressions.
#
# Keyed by (pattern, flags): caching by the pattern alone would let a
# lookup return a regex that was compiled with different flags.
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows adding
    regular expressions and define sub at class init time.

    Regular expressions can be cached via an argument, helping to speedup
    searches.
    """

    def _add_regex(self, string, flags):
        """
        Compile a new regex or reuse it from the cache.

        :param string: regular expression pattern
        :param flags: re.compile() flags
        """
        key = (string, flags)

        self.regex = re_cache.get(key)
        if self.regex is None:
            self.regex = re.compile(string, flags=flags)
            if self.cache:
                re_cache[key] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.

        :param string: regular expression pattern
        :param cache: if True, store the compiled regex at the module cache
        :param flags: re.compile() flags
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __repr__(self):
        return f're.compile("{self.regex.pattern}")'

    def __add__(self, other):
        """
        Allows adding two regular expressions into one.

        The result is cached if either operand requested caching, and it
        carries the union of both compile flags.
        """

        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handles a re.match storing its results, for later use via group().
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search storing its results, for later use via group().
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall.
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split.
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub.
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match.

        Raises AttributeError if no previous match() or search() succeeded.
        """

        return self.last_match.group(num)
108
109
class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    # the start delimiter, e.g.:
    #
    # \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter-aligned.
    #
    # The content inside parentheses is converted into a single replace
    # group (e.g. r`\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to:
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # it is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    # Maps each open delimiter to its expected close delimiter
    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    # Matches any single open or close delimiter handled by this class
    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.

        Yields (start, offset, stop) tuples, where start..stop is the full
        match span and offset points just past the opening delimiter.
        """

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            # The regex is expected to end at an open delimiter
            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            # Use a fresh stack per candidate match, so that leftovers
            # from a previous unpaired candidate don't leak into this one
            stack = [self.DELIMITER_PAIRS[d]]

            for match in self.RE_DELIM.finditer(line[offset:]):
                pos = match.start() + offset

                d = line[pos]

                if d in self.DELIMITER_PAIRS:
                    stack.append(self.DELIMITER_PAIRS[d])
                    continue

                # Does the end delimiter match what is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.
        """

        for t in self._search(regex, line):

            yield line[t[0]:t[2]]

    def sub(self, regex, sub, line, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if the sub argument contains::

                r'\1'

        it will work just like re: it places there the matched paired data
        with the delimiter stripped.

        If count is different than zero, it will replace at most count
        items.
        """
        out = ""

        cur_pos = 0
        n = 0

        for start, end, pos in self._search(regex, line):
            out += line[cur_pos:start]

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # Replaces \1 at the sub string, if \1 is used there
            out += sub.replace(r'\1', value)

            # Drop end ';' if any. Guard against a match ending exactly at
            # the end of line, where there's no character left to inspect
            if pos < len(line) and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            # Stop after 'count' replacements (count == 0 means "all").
            # Note: the test must be n >= count; count >= n would stop
            # after the first replacement for any non-zero count
            if count and n >= count:
                break

        # Append the remaining string
        out += line[cur_pos:]

        return out