Add options to keep paragraph breaks (two newlines) and to strip whitespace from each line

2021-09-19 22:32:58 +03:00 · 2021-02-15 23:20:03 +01:00
parent ff88579088
commit 1c15db66c9
3 changed files with 88 additions and 8 deletions
--- a/cleantext/clean.py
+++ b/cleantext/clean.py
@@ -94,7 +94,9 @@ def to_ascii_unicode(text, lang="en", no_emoji=False):
    return text


-def normalize_whitespace(text, no_line_breaks=False):
+def normalize_whitespace(
+    text, no_line_breaks=False, strip_lines=True, keep_two_line_breaks=False
+):
    """
    Given ``text`` str, replace one or more spacings with a single space, and one
    or more line breaks with a single newline. Also strip leading/trailing whitespace.
@@ -102,9 +104,17 @@ def normalize_whitespace(text, no_line_breaks=False):
    if no_line_breaks:
        text = constants.MULTI_WHITESPACE_TO_ONE_REGEX.sub(" ", text)
    else:
-        text = constants.NONBREAKING_SPACE_REGEX.sub(
-            " ", constants.LINEBREAK_REGEX.sub(r"\n", text)
-        )
+        if keep_two_line_breaks:
+            text = constants.NONBREAKING_SPACE_REGEX.sub(
+                " ", constants.TWO_LINEBREAK_REGEX.sub(r"\n\n", text)
+            )
+        else:
+            text = constants.NONBREAKING_SPACE_REGEX.sub(
+                " ", constants.LINEBREAK_REGEX.sub(r"\n", text)
+            )
+    if strip_lines:
+        text = "\n".join([x.strip() for x in text.splitlines()])
+
    return text.strip()


@@ -192,6 +202,8 @@ def clean(
    lower=True,
    normalize_whitespace=True,
    no_line_breaks=False,
+    strip_lines=True,
+    keep_two_line_breaks=False,
    no_urls=False,
    no_emails=False,
    no_phone_numbers=False,
@@ -280,6 +292,6 @@ def clean(
        text = text.lower()

    if normalize_whitespace:
-        text = _normalize_whitespace(text, no_line_breaks)
+        text = _normalize_whitespace(text, no_line_breaks, strip_lines, keep_two_line_breaks)

    return text
--- a/cleantext/constants.py
+++ b/cleantext/constants.py
@@ -53,6 +53,7 @@ NUMBERS_REGEX = re.compile(
 )

 LINEBREAK_REGEX = re.compile(r"((\r\n)|[\n\v])+")
+TWO_LINEBREAK_REGEX = re.compile(r"((\r\n)|[\n\v])+((\r\n)|[\n\v])+")
 MULTI_WHITESPACE_TO_ONE_REGEX = re.compile(r"\s+")
 NONBREAKING_SPACE_REGEX = re.compile(r"(?!\n)\s+")

--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -173,14 +173,81 @@ def test_whitespace():
    )


-emoji_line = "🤔 🙈 me, se 😌 ds 💕👭👙 hello 👩🏾‍🎓 emoji hello 👨‍👩‍👦‍👦 how are 😊 you today🙅🏽🙅🏽"
+emoji_line = (
+    "🤔 🙈 me, se 😌 ds 💕👭👙 hello 👩🏾‍🎓 emoji hello 👨‍👩‍👦‍👦 how are 😊 you today🙅🏽🙅🏽"
+)
+

 def test_keep_emojis():
    assert cleantext.clean(emoji_line) == emoji_line


 def test_remove_emojis():
-    assert cleantext.clean(emoji_line, no_emoji=True) == "me, se ds hello emoji hello how are you today"
+    assert (
+        cleantext.clean(emoji_line, no_emoji=True)
+        == "me, se ds hello emoji hello how are you today"
+    )
+

 def test_remove_emojis_no_ascii():
-    assert cleantext.clean("😊 you today🙅🏽🙅🏽", to_ascii=False, no_emoji=True) == "you today"
+    assert (
+        cleantext.clean("😊 you today🙅🏽🙅🏽", to_ascii=False, no_emoji=True) == "you today"
+    )
+
+
+def test_remove_trail_leading_whitespace():
+    text_input = """
+    Sehr geehrte Damen und Herren,
+
+ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).
+
+Der Fotoautomat steht in  19061  Berlin.
+
+
+
+		Marke: Fotofix
+
+
+
+
+
+		Ort des Automats: Bezirksamt / Bürgeramt / Bürgerbüro
+
+
+
+
+
+Mit freundlichen Grüßen,
+Johannes dfdfd
+    """
+
+    text_output = """Sehr geehrte Damen und Herren,
+
+ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).
+
+Der Fotoautomat steht in 19061 Berlin.
+
+Marke: Fotofix
+
+Ort des Automats: Bezirksamt / Bürgeramt / Bürgerbüro
+
+Mit freundlichen Grüßen,
+Johannes dfdfd"""
+
+    print(
+        cleantext.clean(
+            text_input,
+            lower=False,
+            lang="de",
+            no_line_breaks=False,
+            keep_two_line_breaks=True,
+        )
+    )
+
+    assert text_output == cleantext.clean(
+        text_input,
+        lower=False,
+        lang="de",
+        no_line_breaks=False,
+        keep_two_line_breaks=True,
+    )