-
Notifications
You must be signed in to change notification settings - Fork 265
fix(filters): escape - in regex_url character class (#1601)
#1880
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
9dbc439
9abd3fe
4004624
5661033
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -555,6 +555,59 @@ def test_urls(self) -> None: | |
| ) | ||
| assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" | ||
|
|
||
| def test_url_regex_does_not_swallow_html_tags(self) -> None: | ||
| # Regression for #1601: previously the `[$-_…]` range matched `<`, | ||
| # `>`, `;`, `:`, etc., so a URL match bled past the URL into the | ||
| # surrounding HTML/punctuation. | ||
| from nemo_curator.stages.text.utils.constants import regex_url | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you move this import to the top of the file instead? |
||
|
|
||
| assert regex_url.findall("see http://x.com<bad> for details") == ["http://x.com"] | ||
| assert regex_url.findall("click http://example.com;next") == ["http://example.com"] | ||
|
|
||
| def test_url_regex_still_matches_allowed_characters(self) -> None: | ||
| # The fix must not regress on characters the original character | ||
| # class intended to allow: letters, digits, `$`, `_`, `@`, `.`, | ||
| # `&`, `+`, `-`, `!`, `*`, `(`, `)`, `,`, and percent-encoded escapes. | ||
| from nemo_curator.stages.text.utils.constants import regex_url | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comment as above. |
||
|
|
||
| text = "ref https://A.B-C_D+E&f!*(g),h%2F end" | ||
|
|
||
| assert regex_url.findall(text) == ["https://A.B-C_D+E&f!*(g),h%2F"] | ||
|
|
||
| def test_urls_filter_accepts_custom_regex(self) -> None: | ||
| # Per the discussion on #1601, the URL regex should be | ||
| # customizable on the filter so callers can swap in a stricter or | ||
| # looser pattern (e.g. `r"https?://[^\s]+"`). | ||
| dataset = list_to_dataset( | ||
| [ | ||
| "ftp://files.example.com/archive.tar.gz", | ||
| "no urls here!", | ||
| "https://www.nvidia.com/en-us/", | ||
| ] | ||
| ) | ||
| # Custom regex matches `ftp://` URLs that the default does not. | ||
| filters = ScoreFilter(UrlsFilter(url_regex=r"ftp://[^\s]+")) | ||
|
|
||
| filtered_data = filters.process(dataset) | ||
|
|
||
| expected_data = DocumentBatch( | ||
| data=pd.DataFrame({"text": ["no urls here!", "https://www.nvidia.com/en-us/"]}), | ||
| task_id="batch_1_urls_ratio", | ||
| dataset_name="test_1", | ||
| ) | ||
| assert all_equal(expected_data, filtered_data), f"Expected {expected_data} but got {filtered_data}" | ||
|
|
||
| def test_urls_filter_accepts_compiled_pattern(self) -> None: | ||
| # The custom regex argument should also accept a pre-compiled | ||
| # `re.Pattern` instance, not just a string. | ||
| import re | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comment as above. |
||
|
|
||
| compiled = re.compile(r"https?://[^\s]+") | ||
| urls_filter = UrlsFilter(url_regex=compiled) | ||
|
|
||
| # The constructor stores the same compiled object, not a re-compile. | ||
| assert urls_filter._url_regex is compiled | ||
|
|
||
| def test_bullets(self) -> None: | ||
| dataset = list_to_dataset( | ||
| [ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove link please.