1
0
mirror of https://github.com/jfilter/clean-text.git synced 2021-09-19 22:32:58 +03:00
Files
clean-text-nlp-preprocessing/tests/test_clean.py
Johannes Filter d0c1eb6077 getting stuff done
2018-12-21 21:18:08 +01:00

89 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pytest
import cleantext
def test_normalize_whitespace():
    """Check normalize_whitespace against a sample with mixed whitespace."""
    # The sample mixes tabs, repeated newlines, a CRLF pair and a trailing space.
    messy = "Hello, world! Hello...\t \tworld?\n\nHello:\r\n\n\nWorld. "
    tidy = "Hello, world! Hello... world?\nHello:\nWorld."
    cleaned = cleantext.normalize_whitespace(messy)
    assert cleaned == tidy
def test_unpack_contractions():
    """Check that unpack_contractions expands each contraction in the sample."""
    contracted = "Y'all can't believe you're not who they've said I'll become, but shouldn't."
    expanded = "You all can not believe you are not who they have said I will become, but should not."
    result = cleantext.unpack_contractions(contracted)
    assert result == expanded
def test_replace_urls():
    """Check that replace_urls swaps both URLs in the sample for the placeholder."""
    placeholder = "*URL*"
    # One bare "www." address and one full "http://" URL.
    sentence = "I learned everything I know from www.stackoverflow.com and http://wikipedia.org/ and Mom."
    expected = "I learned everything I know from " + placeholder + " and " + placeholder + " and Mom."
    result = cleantext.replace_urls(sentence, placeholder)
    assert result == expected
def test_replace_emails():
    """Check that replace_emails swaps the e-mail address for the placeholder."""
    placeholder = "*EMAIL*"
    sentence = "I can be reached at username@example.com through next Friday."
    expected = "I can be reached at *EMAIL* through next Friday."
    result = cleantext.replace_emails(sentence, placeholder)
    assert result == expected
def test_replace_phone_numbers():
    """Check that replace_phone_numbers swaps the phone number for the placeholder."""
    placeholder = "*PHONE*"
    sentence = "I can be reached at 555-123-4567 through next Friday."
    expected = "I can be reached at *PHONE* through next Friday."
    result = cleantext.replace_phone_numbers(sentence, placeholder)
    assert result == expected
def test_replace_numbers():
    """Check that replace_numbers swaps every number in the sample for the placeholder."""
    placeholder = "*NUM*"
    # Covers a decimal with a thousands separator, plain integers and a signed "+1".
    sentence = "I owe $1,000.99 to 123 people for 2 +1 reasons."
    expected = "I owe $*NUM* to *NUM* people for *NUM* *NUM* reasons."
    result = cleantext.replace_numbers(sentence, placeholder)
    assert result == expected
def test_remove_punct():
    """Check remove_punct with its default settings against the expected string."""
    punctuated = "I can't. No, I won't! It's a matter of \"principle\"; of -- what's the word? -- conscience."
    stripped = "I can t No I won t It s a matter of principle of what s the word conscience "
    result = cleantext.remove_punct(punctuated)
    assert result == stripped
def test_remove_punct_marks():
    """Check remove_punct when only the marks given via ``marks`` are targeted."""
    punctuated = "I can't. No, I won't! It's a matter of \"principle\"; of -- what's the word? -- conscience."
    # Hyphens, apostrophes and double quotes are gone; the other punctuation stays.
    partially_stripped = "I can t. No, I won t! It s a matter of principle ; of what s the word? conscience."
    result = cleantext.remove_punct(punctuated, marks="-'\"")
    assert result == partially_stripped
def test_replace_currency_symbols():
    """Check replace_currency_symbols with ``replace_with=None`` and with a custom marker.

    Each case holds the input, the expected output for ``replace_with=None``
    and the expected output for ``replace_with="*CUR* "``.
    """
    cases = (
        (
            "$1.00 equals £0.67 equals €0.91.",
            "USD1.00 equals GBP0.67 equals EUR0.91.",
            "*CUR* 1.00 equals *CUR* 0.67 equals *CUR* 0.91.",
        ),
        (
            "this zebra costs $100.",
            "this zebra costs USD100.",
            "this zebra costs *CUR* 100.",
        ),
    )
    for case in cases:
        raw, with_codes, with_marker = case
        assert cleantext.replace_currency_symbols(raw, replace_with=None) == with_codes
        marked = cleantext.replace_currency_symbols(raw, replace_with="*CUR* ")
        assert marked == with_marker
def test_remove_accents():
    """Check remove_accents for both supported methods and for an unknown one."""
    accented = "El niño se asustó -- qué miedo!"
    plain = "El nino se asusto -- que miedo!"
    # Both methods are expected to give the same result for this sample.
    for method in ("unicode", "ascii"):
        assert cleantext.remove_accents(accented, method=method) == plain
    # An unrecognised method name must raise.
    with pytest.raises(Exception):
        cleantext.remove_accents(accented, method="foo")
def test_fix_unicode():
    """Check that fix_bad_unicode turns the escaped quote sequences into plain apostrophes."""
    # The doubled backslashes keep the \u2018 / \u2019 escapes as literal text in the input.
    escaped = "and install a \\u2018new\\u2019 society in their"
    repaired = cleantext.fix_bad_unicode(escaped)
    assert repaired == "and install a 'new' society in their"