Add safeparse to string udfs (#125) · roost.tools/osprey@fa12485

+36 -8

2 changed files

expand all

osprey_worker

src

osprey

engine

stdlib

udfs

string.py

tests

test_strings.py

+30 -8

osprey_worker/src/osprey/engine/stdlib/udfs/string.py

··· 354 354 return s 355 355 356 356 357 + def _safe_urlparse(url: str) -> Optional[ParseResult]: 358 + """Safely parse a URL, returning None for malformed URLs (e.g., invalid IPv6).""" 359 + try: 360 + return urlparse(url) 361 + except ValueError: 362 + # urlparse raises ValueError for malformed URLs like invalid IPv6 addresses 363 + return None 364 + 365 + 357 366 class StringExtractDomains(UDFBase[StringArguments, List[str]]): 358 367 """ 359 368 Used to extract a list of potential URL domains from a string of tokens. Returns a list ··· 366 375 def execute(self, execution_context: ExecutionContext, arguments: StringArguments) -> List[str]: 367 376 # split the message into individual tokens as based on a modified URL regex from messages_common. 368 377 # should capture space based links and markdown based links without duplication. 369 - potential_urls: Iterator[ParseResult] = ( 370 - urlparse(token) for token in re.findall('(https?:\/\/[^\/\s][^\s\)>]+)', arguments.s) 378 + potential_urls: Iterator[Optional[ParseResult]] = ( 379 + _safe_urlparse(token) for token in re.findall('(https?:\/\/[^\/\s][^\s\)>]+)', arguments.s) 371 380 ) 372 381 373 - # filter out any tokens that do not have a scheme or a domain 374 - valid_domains: Set[str] = set(url.netloc.split(':')[0] for url in potential_urls if url.scheme and url.netloc) 382 + # filter out any tokens that do not have a scheme or a domain (or failed to parse) 383 + def extract_host(netloc: str) -> str: 384 + # IPv6 addresses are enclosed in brackets, e.g. [::1]:8080 385 + if netloc.startswith('['): 386 + bracket_end = netloc.find(']') 387 + if bracket_end != -1: 388 + return netloc[: bracket_end + 1] 389 + # Regular hostname:port - split on colon to strip port 390 + return netloc.split(':')[0] 391 + 392 + valid_domains: Set[str] = set( 393 + extract_host(url.netloc) for url in potential_urls if url is not None and url.scheme and url.netloc 394 + ) 375 395 376 396 # return any valid domains encountered in the message 377 397 return list(valid_domains) ··· 389 409 def execute(self, execution_context: ExecutionContext, arguments: StringArguments) -> List[str]: 390 410 # split the message into individual tokens as based on a modified URL regex from messages_common. 391 411 # should capture space based links and markdown based links without duplication. 392 - potential_urls: Iterator[ParseResult] = ( 393 - urlparse(token) for token in re.findall('(https?:\/\/[^\/\s][^\s\)>]+)', arguments.s) 412 + potential_urls: Iterator[Optional[ParseResult]] = ( 413 + _safe_urlparse(token) for token in re.findall('(https?:\/\/[^\/\s][^\s\)>]+)', arguments.s) 394 414 ) 395 415 396 - # filter out any tokens that do not have a scheme or a domain 416 + # filter out any tokens that do not have a scheme or a domain (or failed to parse) 397 417 valid_urls: Set[str] = set( 398 - urlunparse(parsed_url) for parsed_url in potential_urls if parsed_url.scheme and parsed_url.netloc 418 + urlunparse(parsed_url) 419 + for parsed_url in potential_urls 420 + if parsed_url is not None and parsed_url.scheme and parsed_url.netloc 399 421 ) 400 422 401 423 # return any valid urls encountered in the message

osprey_worker/src/osprey/engine/stdlib/udfs/tests/test_strings.py

··· 261 261 ), 262 262 (f'https:/{QUICK_BROWN_FOX_DOMAIN_1}', []), # invalid url 263 263 (f'https:///{QUICK_BROWN_FOX_DOMAIN_1}', []), # invalid url 264 + ('https://[::1]:8080/path', ['[::1]']), # valid IPv6 URL 265 + ('https://[invalid', []), # invalid IPv6 URL (unclosed bracket) - should not raise ValueError 266 + ('check this out https://[bad::ipv6 click here', []), # malformed IPv6 in text 264 267 ], 265 268 ) 266 269 def test_extract_domains(execute: ExecuteFunction, text: str, expected_result: List[str]) -> None: ··· 307 310 ), 308 311 (f'https:/{QUICK_BROWN_FOX_DOMAIN_1}', []), # invalid url 309 312 (f'https:///{QUICK_BROWN_FOX_DOMAIN_1}', []), # invalid url 313 + ('https://[::1]:8080/path', ['https://[::1]:8080/path']), # valid IPv6 URL 314 + ('https://[invalid', []), # invalid IPv6 URL (unclosed bracket) - should not raise ValueError 315 + ('check this out https://[bad::ipv6 click here', []), # malformed IPv6 in text 310 316 ], 311 317 ) 312 318 def test_extract_urls(execute: ExecuteFunction, text: str, expected_result: List[str]) -> None:

Configure Feed

Configure Feed