1
0
mirror of https://github.com/jfilter/clean-text.git synced 2021-09-19 22:32:58 +03:00

make it possible to replace punctuations (close #12)

This commit is contained in:
Johannes Filter
2020-10-17 21:47:17 +02:00
parent d364b3e2a9
commit 8a688b8d1f
3 changed files with 34 additions and 4 deletions

View File

@@ -55,7 +55,8 @@ clean("some input",
no_numbers=False, # replace all numbers with a special token
no_digits=False, # replace all digits with a special token
no_currency_symbols=False, # replace all currency symbols with a special token
no_punct=False, # fully remove punctuation
no_punct=False, # remove punctuations
replace_with_punct="", # instead of removing punctuations you may replace them
replace_with_url="<URL>",
replace_with_email="<EMAIL>",
replace_with_phone_number="<PHONE>",

View File

@@ -4,6 +4,9 @@ Clean your text to create normalized text representations.
import logging
import re
import sys
from unicodedata import category
from ftfy import fix_text
@@ -147,6 +150,15 @@ def replace_currency_symbols(text, replace_with="<CUR>"):
return constants.CURRENCY_REGEX.sub(replace_with, text)
def replace_punct(text, replace_with=" "):
    """
    Replace every punctuation character in ``text`` with ``replace_with``.

    A character is treated as punctuation when its Unicode general category
    starts with "P".

    Args:
        text (str): raw text
        replace_with (str): replacement inserted for each punctuation
            character, default " "

    Returns:
        str
    """
    # Only characters that occur in ``text`` can ever be looked up by
    # ``str.translate``, so build the table from those instead of scanning the
    # entire Unicode range on every call.
    table = {
        ord(char): replace_with
        for char in set(text)
        if category(char).startswith("P")
    }
    return text.translate(table)
def remove_punct(text):
"""
Replace punctuations from ``text`` with whitespaces.
@@ -178,6 +190,7 @@ def clean(
replace_with_number="<NUMBER>",
replace_with_digit="0",
replace_with_currency_symbol="<CUR>",
replace_with_punct="",
lang="en",
):
"""
@@ -207,6 +220,7 @@ def clean(
replace_with_number (str): special NUMBER token, default "<NUMBER>",
replace_with_digit (str): special DIGIT token, default "0",
replace_with_currency_symbol (str): special CURRENCY token, default "<CUR>",
replace_with_punct (str): replace punctuations with this token, default "",
lang (str): special language-dependent preprocessing. Besides the default English ('en'), only German ('de') is supported
Returns:
@@ -238,7 +252,10 @@ def clean(
if no_digits:
text = replace_digits(text, replace_with_digit)
if no_punct:
text = remove_punct(text)
if replace_with_punct == "":
text = remove_punct(text)
else:
text = replace_punct(text, replace_with_punct)
if lower:
text = text.lower()

View File

@@ -108,6 +108,12 @@ def test_remove_punct():
assert cleantext.remove_punct(text) == proc_text
def test_replace_punct():
    """Check that ``clean`` substitutes punctuation when ``replace_with_punct`` is given."""
    cleaned = cleantext.clean(
        "I can't. No, I won't!", no_punct=True, replace_with_punct=" "
    )
    assert cleaned == "i can t no i won t"
def test_replace_currency_symbols():
tests = [
(
@@ -157,5 +163,11 @@ def test_to_ascii():
def test_whitespace():
assert cleantext.clean(" peter", normalize_whitespace=False) == " peter"
assert cleantext.clean(" peter", normalize_whitespace=True) == "peter"
assert cleantext.clean(" pet\n\ner", normalize_whitespace=True, no_line_breaks=True) == "pet er"
assert cleantext.clean(" pet\n\ner", normalize_whitespace=True, no_line_breaks=False) == "pet\ner"
assert (
cleantext.clean(" pet\n\ner", normalize_whitespace=True, no_line_breaks=True)
== "pet er"
)
assert (
cleantext.clean(" pet\n\ner", normalize_whitespace=True, no_line_breaks=False)
== "pet\ner"
)