Fix whitespace bugs: strip each line before normalizing line breaks

2021-09-19 22:32:58 +03:00 · 2021-02-16 22:33:32 +01:00
parent 00cad45b1c
commit 90946c2cc8
2 changed files with 38 additions and 2 deletions
--- a/cleantext/clean.py
+++ b/cleantext/clean.py
@@ -101,6 +101,9 @@ def normalize_whitespace(
    Given ``text`` str, replace one or more spacings with a single space, and one
    or more line breaks with a single newline. Also strip leading/trailing whitespace.
    """
+    if strip_lines:
+        text = "\n".join([x.strip() for x in text.splitlines()])
+
    if no_line_breaks:
        text = constants.MULTI_WHITESPACE_TO_ONE_REGEX.sub(" ", text)
    else:
@@ -112,8 +115,6 @@ def normalize_whitespace(
            text = constants.NONBREAKING_SPACE_REGEX.sub(
                " ", constants.LINEBREAK_REGEX.sub(r"\n", text)
            )
-    if strip_lines:
-        text = "\n".join([x.strip() for x in text.splitlines()])

    return text.strip()

--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -251,3 +251,38 @@ Johannes dfdfd"""
        no_line_breaks=False,
        keep_two_line_breaks=True,
    )
+
+def test_remove_trail_leading_whitespace():
+    text_input = b'Sehr geehrte Damen und Herren,\\r\\n\\r\\nich m\\xf6chte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten f\\xfcr biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).\\r\\n\\r\\nDer Fotoautomat steht in  .\\r\\n\\r\\n\\r\\n\\t\\r\\n\\t\\tOrt des Automats: \\r\\n\\t\\r\\n\\r\\n\\r\\n\\r\\n \\r\\n\\t\\r\\n\\t\\tMarke: \\r\\n\\t\\r\\n\\r\\n\\r\\n\\r\\n\\r\\nHier noch Text von Anna Lena.\\r\\n\\r\\nMit freundlichen Gr\\xfc\\xdfen'
+    text_input = text_input.decode('unicode_escape')
+    text_output = """Sehr geehrte Damen und Herren,
+
+ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).
+
+Der Fotoautomat steht in .
+
+Ort des Automats:
+
+Marke:
+
+Hier noch Text von Anna Lena.
+
+Mit freundlichen Grüßen"""
+    # Debug aid: print the cleaned text; the identical call is asserted below.
+    print(
+        cleantext.clean(
+            text_input,
+            lower=False,
+            lang="de",
+            no_line_breaks=False,
+            keep_two_line_breaks=True,
+        )
+    )
+    # The CRLF breaks and tab/space-only lines of the input must clean to exactly text_output.
+    assert text_output == cleantext.clean(
+        text_input,
+        lower=False,
+        lang="de",
+        no_line_breaks=False,
+        keep_two_line_breaks=True,
+    )