mirror of
https://github.com/jfilter/clean-text.git
synced 2021-09-19 22:32:58 +03:00
Add options to keep double line breaks (keep_two_line_breaks) and to strip each line (strip_lines)
This commit is contained in:
@@ -94,7 +94,9 @@ def to_ascii_unicode(text, lang="en", no_emoji=False):
|
||||
return text
|
||||
|
||||
|
||||
def normalize_whitespace(text, no_line_breaks=False):
|
||||
def normalize_whitespace(
|
||||
text, no_line_breaks=False, strip_lines=True, keep_two_line_breaks=False
|
||||
):
|
||||
"""
|
||||
Given ``text`` str, replace one or more spacings with a single space, and one
|
||||
or more line breaks with a single newline. Also strip leading/trailing whitespace.
|
||||
@@ -102,9 +104,17 @@ def normalize_whitespace(text, no_line_breaks=False):
|
||||
if no_line_breaks:
|
||||
text = constants.MULTI_WHITESPACE_TO_ONE_REGEX.sub(" ", text)
|
||||
else:
|
||||
text = constants.NONBREAKING_SPACE_REGEX.sub(
|
||||
" ", constants.LINEBREAK_REGEX.sub(r"\n", text)
|
||||
)
|
||||
if keep_two_line_breaks:
|
||||
text = constants.NONBREAKING_SPACE_REGEX.sub(
|
||||
" ", constants.TWO_LINEBREAK_REGEX.sub(r"\n\n", text)
|
||||
)
|
||||
else:
|
||||
text = constants.NONBREAKING_SPACE_REGEX.sub(
|
||||
" ", constants.LINEBREAK_REGEX.sub(r"\n", text)
|
||||
)
|
||||
if strip_lines:
|
||||
text = "\n".join([x.strip() for x in text.splitlines()])
|
||||
|
||||
return text.strip()
|
||||
|
||||
|
||||
@@ -192,6 +202,8 @@ def clean(
|
||||
lower=True,
|
||||
normalize_whitespace=True,
|
||||
no_line_breaks=False,
|
||||
strip_lines=True,
|
||||
keep_two_line_breaks=False,
|
||||
no_urls=False,
|
||||
no_emails=False,
|
||||
no_phone_numbers=False,
|
||||
@@ -280,6 +292,6 @@ def clean(
|
||||
text = text.lower()
|
||||
|
||||
if normalize_whitespace:
|
||||
text = _normalize_whitespace(text, no_line_breaks)
|
||||
text = _normalize_whitespace(text, no_line_breaks, strip_lines, keep_two_line_breaks)
|
||||
|
||||
return text
|
||||
|
||||
@@ -53,6 +53,7 @@ NUMBERS_REGEX = re.compile(
|
||||
)
|
||||
|
||||
LINEBREAK_REGEX = re.compile(r"((\r\n)|[\n\v])+")
|
||||
TWO_LINEBREAK_REGEX = re.compile(r"((\r\n)|[\n\v])+((\r\n)|[\n\v])+")
|
||||
MULTI_WHITESPACE_TO_ONE_REGEX = re.compile(r"\s+")
|
||||
NONBREAKING_SPACE_REGEX = re.compile(r"(?!\n)\s+")
|
||||
|
||||
|
||||
@@ -173,14 +173,81 @@ def test_whitespace():
|
||||
)
|
||||
|
||||
|
||||
emoji_line = "🤔 🙈 me, se 😌 ds 💕👭👙 hello 👩🏾🎓 emoji hello 👨👩👦👦 how are 😊 you today🙅🏽🙅🏽"
|
||||
emoji_line = (
|
||||
"🤔 🙈 me, se 😌 ds 💕👭👙 hello 👩🏾🎓 emoji hello 👨👩👦👦 how are 😊 you today🙅🏽🙅🏽"
|
||||
)
|
||||
|
||||
|
||||
def test_keep_emojis():
|
||||
assert cleantext.clean(emoji_line) == emoji_line
|
||||
|
||||
|
||||
def test_remove_emojis():
|
||||
assert cleantext.clean(emoji_line, no_emoji=True) == "me, se ds hello emoji hello how are you today"
|
||||
assert (
|
||||
cleantext.clean(emoji_line, no_emoji=True)
|
||||
== "me, se ds hello emoji hello how are you today"
|
||||
)
|
||||
|
||||
|
||||
def test_remove_emojis_no_ascii():
|
||||
assert cleantext.clean("😊 you today🙅🏽🙅🏽", to_ascii=False, no_emoji=True) == "you today"
|
||||
assert (
|
||||
cleantext.clean("😊 you today🙅🏽🙅🏽", to_ascii=False, no_emoji=True) == "you today"
|
||||
)
|
||||
|
||||
|
||||
def test_remove_trail_leading_whitespace():
|
||||
text_input = """
|
||||
Sehr geehrte Damen und Herren,
|
||||
|
||||
ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).
|
||||
|
||||
Der Fotoautomat steht in 19061 Berlin.
|
||||
|
||||
|
||||
|
||||
Marke: Fotofix
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Ort des Automats: Bezirksamt / Bürgeramt / Bürgerbüro
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Mit freundlichen Grüßen,
|
||||
Johannes dfdfd
|
||||
"""
|
||||
|
||||
text_output = """Sehr geehrte Damen und Herren,
|
||||
|
||||
ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).
|
||||
|
||||
Der Fotoautomat steht in 19061 Berlin.
|
||||
|
||||
Marke: Fotofix
|
||||
|
||||
Ort des Automats: Bezirksamt / Bürgeramt / Bürgerbüro
|
||||
|
||||
Mit freundlichen Grüßen,
|
||||
Johannes dfdfd"""
|
||||
|
||||
print(
|
||||
cleantext.clean(
|
||||
text_input,
|
||||
lower=False,
|
||||
lang="de",
|
||||
no_line_breaks=False,
|
||||
keep_two_line_breaks=True,
|
||||
)
|
||||
)
|
||||
|
||||
assert text_output == cleantext.clean(
|
||||
text_input,
|
||||
lower=False,
|
||||
lang="de",
|
||||
no_line_breaks=False,
|
||||
keep_two_line_breaks=True,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user