#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

These help cache regular expressions and do matching for kernel-doc.

Please note that the code here may raise exceptions to indicate bad
usage inside kdoc, i.e. problems at the replace pattern.

Other errors are logged via the log instance.
"""

import logging
import re

from copy import copy

from .kdoc_re import KernRe

log = logging.getLogger(__name__)

def tokenizer_set_log(logger, prefix=""):
    """
    Replace the module-level logger with a LoggerAdapter that
    prepends *prefix* to every message.
    """
    global log

    class PrefixAdapter(logging.LoggerAdapter):
        """
        Ancillary class to set a prefix on all logged messages.
        """
        def process(self, msg, kwargs):
            return f"{prefix}{msg}", kwargs

    # Wrap the provided logger in our adapter
    log = PrefixAdapter(logger, {"prefix": prefix})

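# Example usage (an illustrative sketch; "myapp" is a hypothetical name):
#
#     logger = logging.getLogger("myapp")
#     tokenizer_set_log(logger, prefix="kdoc: ")
#
# After this call, a message logged via the module-level ``log``, such as
# log.error("boom"), is reported as "kdoc: boom".
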
class CToken():
    """
    Data class to define a C token.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0         #: A standard C or C99 comment, including delimiters.
    STRING = 1          #: A string, including quotation marks.
    CHAR = 2            #: A character, including apostrophes.
    NUMBER = 3          #: A number.
    PUNC = 4            #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5           #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6             #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7             #: A preprocessor macro.
    HASH = 8            #: The hash character - useful to handle other macros.
    OP = 9              #: A C operator (add, subtract, ...).
    STRUCT = 10         #: A ``struct`` keyword.
    UNION = 11          #: A ``union`` keyword.
    ENUM = 12           #: An ``enum`` keyword.
    TYPEDEF = 13        #: A ``typedef`` keyword.
    NAME = 14           #: A name. Can be an ID or a type.
    SPACE = 15          #: Any space characters, including new lines.
    ENDSTMT = 16        #: End of a statement (``;``).

    BACKREF = 17        #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255      #: An error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from a string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert an integer value from the CToken enum into a string"""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """Convert a string into a CToken enum value"""
        if name in CToken._name_to_val:
            return CToken._name_to_val[name]

        return CToken.MISMATCH

    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        self.kind = kind
        self.value = value
        self.pos = pos
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"

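# The enum/name round trip is handy when decoding scanner matches, e.g.:
#
#     CToken.to_name(CToken.STRUCT)    # -> "STRUCT"
#     CToken.from_name("STRUCT")       # -> CToken.STRUCT
#     CToken.from_name("bogus")        # -> CToken.MISMATCH
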
#: Regexes to parse C code, transforming it into tokens.
RE_SCANNER_LIST = [
    #
    # Note that \s\S is different than .*, as it also catches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING, r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR, r"'(?:\\.|[^'\\])'"),

    (CToken.NUMBER, r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                    r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC, r"[,\.]"),

    (CToken.BEGIN, r"[\[\(\{]"),

    (CToken.END, r"[\]\)\}]"),

    (CToken.CPP, r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH, r"#"),

    (CToken.OP, r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT, r"\bstruct\b"),
    (CToken.UNION, r"\bunion\b"),
    (CToken.ENUM, r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME, r"[A-Za-z_]\w*"),

    (CToken.SPACE, r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    (CToken.MISMATCH, r"."),
]

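# Note that ordering matters above: the scanner tries each alternative in
# list order and keeps the first that matches. For instance, CToken.CPP
# ("#define", ...) must come before CToken.HASH ("#"), and CToken.COMMENT
# must come before CToken.OP, so that "//" is not scanned as two divisions.
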
def fill_re_scanner(token_list):
    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex"""
    re_tokens = []

    for kind, pattern in token_list:
        name = CToken.to_name(kind)
        re_tokens.append(f"(?P<{name}>{pattern})")

    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)

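# The generated pattern is a single alternation of named groups, roughly:
#
#     (?P<COMMENT>//[^\n]*|/\*[\s\S]*?\*/)|(?P<STRING>"(?:\\.|[^"\\])*")|...
#
# so that match.lastgroup directly names the kind of token that matched.
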
#: Handle C continuation lines.
RE_CONT = KernRe(r"\\\n")

RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex, built from RE_SCANNER_LIST at module load time.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)


class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to a string, it drops comments and handles public/private
    markers, respecting depth.
    """

    # This class is inspired by, and follows the basic concepts of:
    # https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None):
        """
        Tokenize ``source`` using the regular expression built from
        RE_SCANNER_LIST.

        While I generally don't like using regex group naming via:
            (?P<name>...)

        in this particular case, it makes sense, as we can pick the name
        when matching code via RE_SCANNER.
        """

        #
        # Store logger to allow parser classes to re-use it
        #
        global log
        self.log = log

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly via an iterator,
        # we'll need to use the tokenizer several times inside kernel-doc
        # to handle macro transforms. So, cache the results in a list, as
        # re-using it is cheaper than having to parse every time.
        #
        for tok in self._tokenize(source):
            self.tokens.append(tok)

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as
        defined at ``RE_SCANNER_LIST``.

        The iterator yields CToken objects.
        """

        # Handle continuation lines. Note that kdoc_parser already has
        # logic to do that. Still, let's keep it here for completeness, as
        # we might end up re-using this tokenizer outside kernel-doc some
        # day - or we may eventually remove it from there as a future
        # cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:                   # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                if value == ')' and paren_level > 0:
                    paren_level -= 1
                elif value == ']' and bracket_level > 0:
                    bracket_level -= 1
                elif brace_level > 0:   # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    def __str__(self):
        out = ""
        show_stack = [True]

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indentation
                    #
                    out += "\t" * (len(show_stack) - 1)

                    out += str(tok.value)
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < len(self.tokens) - 1:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            out += str(tok.value)

        return out


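# Example (an illustrative sketch):
#
#     t = CTokenizer("struct a { int x; /* private: */ int y; };")
#     print(str(t))
#
# tokenizes the C snippet and prints it back without the comment and with
# the private member suppressed, yielding roughly "struct a { int x; };".
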
class CTokenArgs:
    """
    Ancillary class to help using backrefs from sub matches.

    If the highest backref contains a "+" at the last element,
    the logic will be greedy, picking all other delims.

    This is needed to parse struct_group macros which end with ``MEMBERS...``.
    """
    def __init__(self, sub_str):
        self.sub_groups = set()
        self.max_group = -1
        self.greedy = None

        for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str):
            group = int(m.group(1))
            if m.group(2) == "+":
                if self.greedy and self.greedy != group:
                    raise ValueError("There are multiple greedy patterns!")
                self.greedy = group

            self.sub_groups.add(group)
            self.max_group = max(self.max_group, group)

        if self.greedy:
            if self.greedy != self.max_group:
                raise ValueError("Greedy pattern is not the last one!")

            sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str)

        self.sub_str = sub_str
        self.sub_tokenizer = CTokenizer(sub_str)

    def groups(self, new_tokenizer):
        r"""
        Create replacement arguments for backrefs like:

        ``\0``, ``\1``, ``\2``, ... ``\{number}``

        It also accepts a ``+`` character at the highest backref, like
        ``\4+``. When used, the backref will be greedy, picking all other
        arguments afterwards.

        The logic is smart enough to only go up to the maximum required
        argument, even if there are more.

        If there is a backref for an argument above the limit, it will
        raise an exception. Please note that, in C, square brackets don't
        have any separator inside them. Trying to use ``\1``..``\n`` for
        brackets also raises an exception.
        """

        level = (0, 0, 0)

        if self.max_group < 0:
            return level, []

        tokens = new_tokenizer.tokens

        #
        # Fill \0 with the full token contents
        #
        groups_list = [[]]

        if 0 in self.sub_groups:
            inner_level = 0

            for i in range(0, len(tokens)):
                tok = tokens[i]

                if tok.kind == CToken.BEGIN:
                    inner_level += 1

                    #
                    # Discard the first begin
                    #
                    if not groups_list[0]:
                        continue
                elif tok.kind == CToken.END:
                    inner_level -= 1
                    if inner_level < 0:
                        break

                if inner_level:
                    groups_list[0].append(tok)

        if not self.max_group:
            return level, groups_list

        delim = None

        #
        # Ignore everything before BEGIN. The value of BEGIN gives the
        # delimiter to be used for the matches
        #
        for i in range(0, len(tokens)):
            tok = tokens[i]
            if tok.kind == CToken.BEGIN:
                if tok.value == "{":
                    delim = ";"
                elif tok.value == "(":
                    delim = ","
                else:
                    log.error(fr"Can't handle \1..\n on {self.sub_str}")

                level = tok.level
                break

        pos = 1
        groups_list.append([])

        inner_level = 0
        for i in range(i + 1, len(tokens)):
            tok = tokens[i]

            if tok.kind == CToken.BEGIN:
                inner_level += 1
            if tok.kind == CToken.END:
                inner_level -= 1
                if inner_level < 0:
                    break

            if tok.kind in [CToken.PUNC, CToken.ENDSTMT] and delim == tok.value:
                pos += 1
                if self.greedy and pos > self.max_group:
                    pos -= 1
                else:
                    groups_list.append([])

                    if pos > self.max_group:
                        break

                    continue

            groups_list[pos].append(tok)

        if pos < self.max_group:
            log.error(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}")

        return level, groups_list

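    # For instance (an illustrative sketch), with sub_str r"\1, \2+" and a
    # match of "M(a, b, c)", backref \2 is greedy: it collects "b, c",
    # keeping the extra delimiter, which is what the MEMBERS... arguments
    # of struct_group() need.
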
    def tokens(self, new_tokenizer):
        """Replace backrefs at the sub pattern with tokens from a match"""
        level, groups = self.groups(new_tokenizer)

        new = CTokenizer()

        for tok in self.sub_tokenizer.tokens:
            if tok.kind == CToken.BACKREF:
                group = int(tok.value[1:])

                for group_tok in groups[group]:
                    new_tok = copy(group_tok)

                    new_level = [0, 0, 0]

                    for i in range(0, len(level)):
                        new_level[i] = new_tok.level[i] + level[i]

                    new_tok.level = tuple(new_level)

                    new.tokens += [new_tok]
            else:
                new.tokens += [tok]

        return new.tokens


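# Example (an illustrative sketch): with a tokenized match of "FOO(a, b)",
#
#     args = CTokenArgs(r"\1 = \2;")
#     new_tokens = args.tokens(CTokenizer("FOO(a, b)"))
#
# \1 is replaced by the tokens of "a" and \2 by the tokens of "b",
# producing tokens for roughly "a = b;".
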
class CMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

        '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP().

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

    https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    def __init__(self, regex, delim="("):
        self.regex = KernRe("^" + regex + r"\b")
        self.start_delim = delim

    def _search(self, tokenizer):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended up using a different implementation to align all three
        types of delimiters and to seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding the start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for the kernel-doc parser, as unpaired delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        start = None
        started = False

        stack = []

        for i, tok in enumerate(tokenizer.tokens):
            if start is None:
                if tok.kind == CToken.NAME and self.regex.match(tok.value):
                    start = i
                    stack.append((start, tok.level))
                    started = False

                continue

            if not started:
                if tok.kind == CToken.SPACE:
                    continue

                if tok.kind == CToken.BEGIN and tok.value == self.start_delim:
                    started = True
                    continue

                # Name-only token without BEGIN/END
                if i > start:
                    i -= 1
                yield start, i
                start = None

            if tok.kind == CToken.END and tok.level == stack[-1][1]:
                start, level = stack.pop()

                yield start, i
                start = None

        #
        # If an END zeroing levels is not there, return the remaining stuff.
        # This is meant to solve cases where the caller logic might be
        # picking an incomplete block.
        #
        if start is not None and stack:
            if started:
                s = str(tokenizer)
                log.warning(f"can't find a final end at {s}")

            yield start, len(tokenizer.tokens)

    def search(self, source):
        """
        This is similar to re.search:

        It matches a regex that is followed by a delimiter,
        returning occurrences only if all delimiters are paired.
        """

        if isinstance(source, CTokenizer):
            tokenizer = source
            is_token = True
        else:
            tokenizer = CTokenizer(source)
            is_token = False

        for start, end in self._search(tokenizer):
            new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1])

            if is_token:
                yield new_tokenizer
            else:
                yield str(new_tokenizer)

    def sub(self, sub_str, source, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        If the sub argument contains::

            r'\0'

        it will work just like re: it places there the matched paired data
        with the delimiter stripped.

        If count is non-zero, it will replace at most count items.
        """
        if isinstance(source, CTokenizer):
            is_token = True
            tokenizer = source
        else:
            is_token = False
            tokenizer = CTokenizer(source)

        # Detect if sub_str contains sub arguments

        args_match = CTokenArgs(sub_str)

        new_tokenizer = CTokenizer()
        pos = 0
        n = 0

        #
        # NOTE: the code below doesn't consider overlapping matches at sub.
        # We may need to add some extra unit tests to check if those
        # would cause problems. When replacing with "", this should not
        # be a problem, but other transformations could be problematic.
        #
        for start, end in self._search(tokenizer):
            new_tokenizer.tokens += tokenizer.tokens[pos:start]

            new = CTokenizer(tokenizer.tokens[start:end + 1])

            new_tokenizer.tokens += args_match.tokens(new)

            pos = end + 1

            n += 1
            if count and n >= count:
                break

        new_tokenizer.tokens += tokenizer.tokens[pos:]

        if not is_token:
            return str(new_tokenizer)

        return new_tokenizer

    def __repr__(self):
        """
        Returns a displayable version of the class init.
        """

        return f'CMatch("{self.regex.regex.pattern}")'
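
# Example (an illustrative sketch, mirroring the struct_group() use case):
#
#     cm = CMatch(r"STRUCT_GROUP")
#     for m in cm.search("STRUCT_GROUP(int a; int b;); int c;"):
#         print(m)     # the full "STRUCT_GROUP(...)" block, parens paired
#
#     # Drop the macro, keeping its members, via the \0 backref:
#     out = cm.sub(r"\0", "STRUCT_GROUP(int a; int b;)")
#
# where out would be roughly "int a; int b;", with the macro name and its
# outer parentheses stripped.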