Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

docs: add a C tokenizer to be used by kernel-doc

Handling C code purely using regular expressions doesn't work well.

Add a C tokenizer to help do it the right way.

The tokenizer is based on the tokenizer example from the Python re
documentation:
https://docs.python.org/3/library/re.html#writing-a-tokenizer

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <39787bb8022e10c65df40c746077f7f66d07ffed.1773770483.git.mchehab+huawei@kernel.org>

authored by

Mauro Carvalho Chehab and committed by
Jonathan Corbet
df50e848 d5265f7a

+292
+292
tools/lib/python/kdoc/c_lex.py
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
C tokenizer ancillary classes.

Those help splitting C statements and definitions into tokens, so that
kernel-doc doesn't need to handle C code purely via regular expressions.

Unexpected input is logged via log instance.
"""

import logging
import re

from .kdoc_re import KernRe

log = logging.getLogger(__name__)


class CToken():
    """
    Data class to define a C token.

    Besides the token itself, the class also carries the enum-like
    constants used to classify tokens and helpers to convert those
    constants from/to their names.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiter.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: An ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including new lines
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255  #: an error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    #
    # Only public names are considered: the class namespace may also hold
    # integers which are not tokens (Python 3.13+ stores the class
    # ``__firstlineno__`` there), and those must not leak into the maps.
    _name_by_val = {v: k for k, v in dict(vars()).items()
                    if isinstance(v, int) and not k.startswith("_")}

    # Dict to convert from string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """
        Convert from an integer value from CToken enum into a string.

        Values that don't belong to the enum are returned as
        ``UNKNOWN(<val>)``.
        """

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """
        Convert a string into a CToken enum value.

        Names that don't belong to the enum are converted into
        ``CToken.MISMATCH``.
        """

        return CToken._name_to_val.get(name, CToken.MISMATCH)

    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        """
        Store the token data.

        ``kind`` is one of the CToken enum-like constants, ``value`` is
        the matched text and ``pos`` is its offset at the source.

        The nesting depths are stored together at ``self.level`` as a
        ``(bracket_level, paren_level, brace_level)`` tuple. Please notice
        that the tuple order differs from the arguments order.
        """

        self.kind = kind
        self.value = value
        self.pos = pos
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        name = self.to_name(self.kind)

        # Strings are shown between double quotes, without any escaping
        if isinstance(self.value, str):
            value = f'"{self.value}"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"


#: Regexes to parse C code, transforming it into tokens.
RE_SCANNER_LIST = [
    #
    # The order matters here: the patterns are joined as alternatives of
    # a single regex, so the first one that matches wins. That's why, for
    # instance, keywords come before NAME and CPP comes before HASH.
    #

    #
    # Note that \s\S is different than .*, as it also catches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING,  r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR,    r"'(?:\\.|[^'\\])'"),

    (CToken.NUMBER,  r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                     r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    # Spaces placed just before ";" are part of the ENDSTMT token
    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC,    r"[,\.]"),

    (CToken.BEGIN,   r"[\[\(\{]"),

    (CToken.END,     r"[\]\)\}]"),

    (CToken.CPP,     r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH,    r"#"),

    (CToken.OP,      r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                     r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT,  r"\bstruct\b"),
    (CToken.UNION,   r"\bunion\b"),
    (CToken.ENUM,    r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME,    r"[A-Za-z_]\w*"),

    (CToken.SPACE,   r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    # Catch-all: anything not matched by the previous patterns
    (CToken.MISMATCH, r"."),
]


def fill_re_scanner(token_list):
    """
    Ancillary routine to convert RE_SCANNER_LIST into a finditer regex.

    ``token_list`` is a list of ``(kind, pattern)`` tuples. Each pattern
    becomes a named group, whose name is the CToken name of ``kind``.
    The groups are joined as alternatives, following the list order.
    """

    re_tokens = [f"(?P<{CToken.to_name(kind)}>{pattern})"
                 for kind, pattern in token_list]

    # NOTE(review): the flags are passed here as the second positional
    # argument. Please confirm that KernRe expects flags at this position
    # (and not some other option), as its signature is defined elsewhere.
    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)


#: Handle C continuation lines.
RE_CONT = KernRe(r"\\\n")

#: Match the beginning of a C comment, plus any spaces after it.
RE_COMMENT_START = KernRe(r'/\*\s*')

#: tokenizer regex. Built once from RE_SCANNER_LIST when the module loads.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)


class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to string, it drops comments and handle public/private
    values, respecting depth.
    """

    # This class is inspired and follows the basic concepts of:
    #   https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None, log=None):
        """
        Fill ``self.tokens`` from ``source``.

        ``source`` can be:

        - empty or None: the token list is left empty;
        - a list: it is used as-is as the token list (no copy is made);
        - anything else: it is handled as C source code and tokenized.

        ``log`` is the logger used to report unexpected tokens. When not
        specified, the logger from this module is used.
        """

        #
        # The ``log`` argument shadows the module-level logger inside this
        # function, so pick the default one via logging.getLogger(), which
        # returns that very same logger instance.
        #
        if log is None:
            log = logging.getLogger(__name__)

        self.log = log
        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly via iterator,
        # as we'll need to use the tokenizer several times inside kernel-doc
        # to handle macro transforms, cache the results on a list, as
        # re-using it is cheaper than having to parse every time.
        #
        self.tokens = list(self._tokenize(source))

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as defined
        at ``RE_SCANNER_LIST``.

        The iterator returns a CToken class object. Each token carries
        the brace/parenthesis/bracket depths as they are right after
        the token is processed.

        Unexpected characters are logged as errors and still returned
        as ``CToken.MISMATCH`` tokens.
        """

        # Handle continuation lines. Note that kdoc_parser already has a
        # logic to do that. Still, let's keep it for completeness, as we might
        # end re-using this tokenizer outside kernel-doc some day - or we may
        # eventually remove from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                self.log.error("Unexpected token '%s' on pos %d:\n\t'%s'",
                               value, pos, source)
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                #
                # Each delimiter only affects its own counter: an unmatched
                # ")" or "]" shall not decrement the brace level. Levels
                # never go below zero on unbalanced input.
                #
                if value == ')':
                    paren_level = max(paren_level - 1, 0)
                elif value == ']':
                    bracket_level = max(bracket_level - 1, 0)
                else:  # value == '}'
                    brace_level = max(brace_level - 1, 0)

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    @staticmethod
    def _is_end_of_statement(tok):
        """
        Check if ``tok`` is a ";".

        The scanner produces ``CToken.ENDSTMT`` for it. A ``CToken.PUNC``
        with a ";" value is also accepted, for token lists that weren't
        created by the scanner.
        """

        if tok.kind == CToken.ENDSTMT:
            return True

        return tok.kind == CToken.PUNC and tok.value == ";"

    def __str__(self):
        """
        Convert the tokens back into a string.

        Comments are dropped. A comment starting with ``private:`` hides
        the tokens that follow it at the current nesting level (and at the
        inner ones); a comment starting with ``public:`` shows them again.
        Visibility is restored once the block where ``private:`` was
        placed ends.
        """

        out = []

        # One visibility flag per nesting level; the last one is the current
        show_stack = [True]

        last = len(self.tokens) - 1

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                # Inner blocks start with the visibility of the outer one
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Leaving a hidden block and returning to a visible
                    # one: try to preserve indent
                    #
                    out.append("\t" * (len(show_stack) - 1))
                    out.append(str(tok.value))
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < last:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";": drop spaces and duplicated ";"

                if self._is_end_of_statement(next_tok):
                    if (tok.kind == CToken.SPACE or
                            self._is_end_of_statement(tok)):
                        continue

            out.append(str(tok.value))

        return "".join(out)