make it possible to replace punctuations (close #12)

2021-09-19 22:32:58 +03:00 · 2020-10-17 21:47:17 +02:00
parent d364b3e2a9
commit 8a688b8d1f
3 changed files with 34 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -55,7 +55,8 @@ clean("some input",
    no_numbers=False,               # replace all numbers with a special token
    no_digits=False,                # replace all digits with a special token
    no_currency_symbols=False,      # replace all currency symbols with a special token
-    no_punct=False,                 # fully remove punctuation
+    no_punct=False,                 # remove punctuation
+    replace_with_punct="",          # instead of removing punctuation, replace it with this string
    replace_with_url="<URL>",
    replace_with_email="<EMAIL>",
    replace_with_phone_number="<PHONE>",
--- a/cleantext/clean.py
+++ b/cleantext/clean.py
@@ -4,6 +4,9 @@ Clean your text to create normalized text represenations.

 import logging
 import re
+import sys
+from unicodedata import category
+

 from ftfy import fix_text

@@ -147,6 +150,15 @@ def replace_currency_symbols(text, replace_with="<CUR>"):
        return constants.CURRENCY_REGEX.sub(replace_with, text)


+_PUNCT_CODEPOINTS = None
+
+
+def _punct_codepoints():
+    """Return all Unicode punctuation code points, scanning the range only once."""
+    global _PUNCT_CODEPOINTS
+    if _PUNCT_CODEPOINTS is None:
+        # Walking every code point is slow, so the result is kept for later
+        # calls. ``+ 1`` makes the scan include ``sys.maxunicode`` itself.
+        _PUNCT_CODEPOINTS = tuple(
+            i for i in range(sys.maxunicode + 1) if category(chr(i)).startswith("P")
+        )
+    return _PUNCT_CODEPOINTS
+
+
+def replace_punct(text, replace_with=" "):
+    """
+    Replace every punctuation character in ``text`` with ``replace_with``.
+
+    A character counts as punctuation when its Unicode general category
+    starts with "P". Each such character is replaced individually, so
+    consecutive punctuation marks yield consecutive replacements.
+
+    Args:
+        text (str): raw text
+        replace_with (str): replacement for each punctuation character,
+            default " "
+
+    Returns:
+        str: ``text`` with all punctuation characters replaced
+    """
+    # Only the small per-call mapping is rebuilt here; the expensive scan over
+    # all code points is cached by _punct_codepoints().
+    return text.translate(dict.fromkeys(_punct_codepoints(), replace_with))
+
+
 def remove_punct(text):
    """
    Replace punctuations from ``text`` with whitespaces.
@@ -178,6 +190,7 @@ def clean(
    replace_with_number="<NUMBER>",
    replace_with_digit="0",
    replace_with_currency_symbol="<CUR>",
+    replace_with_punct="",
    lang="en",
 ):
    """
@@ -207,6 +220,7 @@ def clean(
        replace_with_number (str): special NUMBER token, default "<NUMBER>",
        replace_with_digit (str): special DIGIT token, default "0",
        replace_with_currency_symbol (str): special CURRENCY token, default "<CUR>",
+        replace_with_punct (str): replace punctuation with this token instead of removing it, default "" (falls back to ``remove_punct``),
        lang (str): special language-depended preprocessing. Besides the default English ('en'), only German ('de') is supported

    Returns:
@@ -238,7 +252,10 @@ def clean(
    if no_digits:
        text = replace_digits(text, replace_with_digit)
    if no_punct:
-        text = remove_punct(text)
+        if replace_with_punct == "":
+            text = remove_punct(text)
+        else:
+            text = replace_punct(text, replace_with_punct)
    if lower:
        text = text.lower()

--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -108,6 +108,12 @@ def test_remove_punct():
    assert cleantext.remove_punct(text) == proc_text


+def test_replace_punct():
+    """``clean`` swaps punctuation for ``replace_with_punct`` when ``no_punct`` is set."""
+    raw = "I can't. No, I won't!"
+    expected = "i can t no i won t"
+    cleaned = cleantext.clean(raw, no_punct=True, replace_with_punct=" ")
+    assert cleaned == expected
+
+
 def test_replace_currency_symbols():
    tests = [
        (
@@ -157,5 +163,11 @@ def test_to_ascii():
 def test_whitespace():
+    # Leading whitespace is kept when normalization is off and stripped when it is on.
    assert cleantext.clean(" peter", normalize_whitespace=False) == " peter"
    assert cleantext.clean(" peter", normalize_whitespace=True) == "peter"
-    assert cleantext.clean(" pet\n\ner", normalize_whitespace=True, no_line_breaks=True) == "pet er"
-    assert cleantext.clean(" pet\n\ner", normalize_whitespace=True, no_line_breaks=False) == "pet\ner"
+    # With normalization on, a run of two newlines becomes a single space when
+    # no_line_breaks=True and a single newline when no_line_breaks=False.
+    assert (
+        cleantext.clean(" pet\n\ner", normalize_whitespace=True, no_line_breaks=True)
+        == "pet er"
+    )
+    assert (
+        cleantext.clean(" pet\n\ner", normalize_whitespace=True, no_line_breaks=False)
+        == "pet\ner"
+    )