docs: kdoc_re: get rid of NestedMatch class

-201

1 changed file

expand all

tools

lib

python

kdoc

kdoc_re.py

-201

tools/lib/python/kdoc/kdoc_re.py

··· 140 140 """ 141 141 142 142 return self.last_match.groups() 143 - 144 - #: Nested delimited pairs (brackets and parenthesis) 145 - DELIMITER_PAIRS = { 146 - '{': '}', 147 - '(': ')', 148 - '[': ']', 149 - } 150 - 151 - #: compiled delimiters 152 - RE_DELIM = KernRe(r'[\{\}\[\]\(\)]') 153 - 154 - 155 - class NestedMatch: 156 - """ 157 - Finding nested delimiters is hard with regular expressions. It is 158 - even harder on Python with its normal re module, as there are several 159 - advanced regular expressions that are missing. 160 - 161 - This is the case of this pattern:: 162 - 163 - '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;' 164 - 165 - which is used to properly match open/close parentheses of the 166 - string search STRUCT_GROUP(), 167 - 168 - Add a class that counts pairs of delimiters, using it to match and 169 - replace nested expressions. 170 - 171 - The original approach was suggested by: 172 - 173 - https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex 174 - 175 - Although I re-implemented it to make it more generic and match 3 types 176 - of delimiters. The logic checks if delimiters are paired. If not, it 177 - will ignore the search string. 178 - """ 179 - 180 - # TODO: make NestedMatch handle multiple match groups 181 - # 182 - # Right now, regular expressions to match it are defined only up to 183 - # the start delimiter, e.g.: 184 - # 185 - # \bSTRUCT_GROUP\( 186 - # 187 - # is similar to: STRUCT_GROUP\((.*)\) 188 - # except that the content inside the match group is delimiter-aligned. 189 - # 190 - # The content inside parentheses is converted into a single replace 191 - # group (e.g. r`\0'). 192 - # 193 - # It would be nice to change such definition to support multiple 194 - # match groups, allowing a regex equivalent to: 195 - # 196 - # FOO\((.*), (.*), (.*)\) 197 - # 198 - # it is probably easier to define it not as a regular expression, but 199 - # with some lexical definition like: 200 - # 201 - # FOO(arg1, arg2, arg3) 202 - 203 - def __init__(self, regex): 204 - self.regex = KernRe(regex) 205 - 206 - def _search(self, line): 207 - """ 208 - Finds paired blocks for a regex that ends with a delimiter. 209 - 210 - The suggestion of using finditer to match pairs came from: 211 - https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex 212 - but I ended using a different implementation to align all three types 213 - of delimiters and seek for an initial regular expression. 214 - 215 - The algorithm seeks for open/close paired delimiters and places them 216 - into a stack, yielding a start/stop position of each match when the 217 - stack is zeroed. 218 - 219 - The algorithm should work fine for properly paired lines, but will 220 - silently ignore end delimiters that precede a start delimiter. 221 - This should be OK for kernel-doc parser, as unaligned delimiters 222 - would cause compilation errors. So, we don't need to raise exceptions 223 - to cover such issues. 224 - """ 225 - 226 - stack = [] 227 - 228 - for match_re in self.regex.finditer(line): 229 - start = match_re.start() 230 - offset = match_re.end() 231 - string_char = None 232 - escape = False 233 - 234 - d = line[offset - 1] 235 - if d not in DELIMITER_PAIRS: 236 - continue 237 - 238 - end = DELIMITER_PAIRS[d] 239 - stack.append(end) 240 - 241 - for match in RE_DELIM.finditer(line[offset:]): 242 - pos = match.start() + offset 243 - 244 - d = line[pos] 245 - 246 - if escape: 247 - escape = False 248 - continue 249 - 250 - if string_char: 251 - if d == '\\': 252 - escape = True 253 - elif d == string_char: 254 - string_char = None 255 - 256 - continue 257 - 258 - if d in ('"', "'"): 259 - string_char = d 260 - continue 261 - 262 - if d in DELIMITER_PAIRS: 263 - end = DELIMITER_PAIRS[d] 264 - 265 - stack.append(end) 266 - continue 267 - 268 - # Does the end delimiter match what is expected? 269 - if stack and d == stack[-1]: 270 - stack.pop() 271 - 272 - if not stack: 273 - yield start, offset, pos + 1 274 - break 275 - 276 - def search(self, line): 277 - """ 278 - This is similar to re.search: 279 - 280 - It matches a regex that it is followed by a delimiter, 281 - returning occurrences only if all delimiters are paired. 282 - """ 283 - 284 - for t in self._search(line): 285 - 286 - yield line[t[0]:t[2]] 287 - 288 - def sub(self, sub, line, count=0): 289 - """ 290 - This is similar to re.sub: 291 - 292 - It matches a regex that it is followed by a delimiter, 293 - replacing occurrences only if all delimiters are paired. 294 - 295 - if the sub argument contains:: 296 - 297 - r'\0' 298 - 299 - it will work just like re: it places there the matched paired data 300 - with the delimiter stripped. 301 - 302 - If count is different than zero, it will replace at most count 303 - items. 304 - """ 305 - out = "" 306 - 307 - cur_pos = 0 308 - n = 0 309 - 310 - for start, end, pos in self._search(line): 311 - out += line[cur_pos:start] 312 - 313 - # Value, ignoring start/end delimiters 314 - value = line[end:pos - 1] 315 - 316 - # replaces \0 at the sub string, if \0 is used there 317 - new_sub = sub 318 - new_sub = new_sub.replace(r'\0', value) 319 - 320 - out += new_sub 321 - 322 - # Drop end ';' if any 323 - if pos < len(line) and line[pos] == ';': 324 - pos += 1 325 - 326 - cur_pos = pos 327 - n += 1 328 - 329 - if count and count >= n: 330 - break 331 - 332 - # Append the remaining string 333 - l = len(line) 334 - out += line[cur_pos:l] 335 - 336 - return out 337 - 338 - def __repr__(self): 339 - """ 340 - Returns a displayable version of the class init. 341 - """ 342 - 343 - return f'NestedMatch("{self.regex.regex.pattern}")'

Configure Feed

Configure Feed