Mirror of https://github.com/jfilter/clean-text.git, synced 2021-09-19 22:32:58 +03:00
make it possible to replace punctuations (close #12)
@@ -55,7 +55,8 @@ clean("some input",
     no_numbers=False,              # replace all numbers with a special token
     no_digits=False,               # replace all digits with a special token
     no_currency_symbols=False,     # replace all currency symbols with a special token
-    no_punct=False,                # fully remove punctuation
+    no_punct=False,                # remove punctuations
+    replace_with_punct="",         # instead of removing punctuations you may replace them
     replace_with_url="<URL>",
     replace_with_email="<EMAIL>",
     replace_with_phone_number="<PHONE>",
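The new keyword only changes behaviour when punctuation handling is switched on. A minimal usage sketch, assuming the package is imported as in the tests below (from cleantext import clean); the expected output is taken from the new test_replace_punct test:

    from cleantext import clean

    # Replace every Unicode punctuation character with a space instead of dropping it.
    clean("I can't. No, I won't!", no_punct=True, replace_with_punct=" ")
    # -> "i can t no i won t"  (lower-casing and whitespace normalization are on by default)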
@@ -4,6 +4,9 @@ Clean your text to create normalized text represenations.
 
 import logging
 import re
+import sys
+from unicodedata import category
+
 
 from ftfy import fix_text
 
@@ -147,6 +150,15 @@ def replace_currency_symbols(text, replace_with="<CUR>"):
     return constants.CURRENCY_REGEX.sub(replace_with, text)
 
 
+def replace_punct(text, replace_with=" "):
+    return text.translate(
+        dict.fromkeys(
+            (i for i in range(sys.maxunicode) if category(chr(i)).startswith("P")),
+            replace_with,
+        )
+    )
+
+
 def remove_punct(text):
     """
     Replace punctuations from ``text`` with whitespaces.
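replace_punct builds a str.translate table mapping every code point whose Unicode category starts with "P" (all punctuation classes) to the replacement string. A standalone sketch of the same mechanism, separate from the library code and with an illustrative table name and input:

    import sys
    from unicodedata import category

    # One entry per punctuation code point; str.translate accepts a dict that
    # maps code points (ints) to replacement strings.
    PUNCT_TABLE = dict.fromkeys(
        (i for i in range(sys.maxunicode) if category(chr(i)).startswith("P")),
        " ",
    )

    text = "Hello, world! (really?)"
    print(text.translate(PUNCT_TABLE))  # every punctuation mark is now a single space

As written in the diff, the table is rebuilt on every replace_punct call by scanning the whole Unicode range, so caching it (as the sketch does at module level) would be a natural follow-up.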
@@ -178,6 +190,7 @@ def clean(
     replace_with_number="<NUMBER>",
     replace_with_digit="0",
     replace_with_currency_symbol="<CUR>",
+    replace_with_punct="",
     lang="en",
 ):
     """
@@ -207,6 +220,7 @@ def clean(
         replace_with_number (str): special NUMBER token, default "<NUMBER>",
         replace_with_digit (str): special DIGIT token, default "0",
         replace_with_currency_symbol (str): special CURRENCY token, default "<CUR>",
+        replace_with_punct (str): replace punctuations with this token, default "",
         lang (str): special language-depended preprocessing. Besides the default English ('en'), only German ('de') is supported
 
     Returns:
@@ -238,7 +252,10 @@ def clean(
     if no_digits:
         text = replace_digits(text, replace_with_digit)
     if no_punct:
-        text = remove_punct(text)
+        if replace_with_punct == "":
+            text = remove_punct(text)
+        else:
+            text = replace_punct(text, replace_with_punct)
     if lower:
         text = text.lower()
 
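Inside clean() the empty string doubles as a sentinel: the block is still guarded by no_punct, and only a non-empty replace_with_punct routes through the new replace_punct, while the default keeps the previous remove_punct behaviour. A sketch of the two call shapes (inputs are illustrative; the second output follows the same transformation asserted in test_replace_punct below):

    from cleantext import clean

    # replace_with_punct alone has no effect -- the branch only runs when no_punct=True,
    # so punctuation is left in place (only defaults such as lower-casing apply).
    clean("I can't!", replace_with_punct=" ")

    # With no_punct=True and a non-empty replacement, replace_punct() is used.
    clean("I can't!", no_punct=True, replace_with_punct=" ")  # -> "i can t"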
@@ -108,6 +108,12 @@ def test_remove_punct():
     assert cleantext.remove_punct(text) == proc_text
 
 
+def test_replace_punct():
+    text = "I can't. No, I won't!"
+    proc_text = "i can t no i won t"
+    assert cleantext.clean(text, no_punct=True, replace_with_punct=" ") == proc_text
+
+
 def test_replace_currency_symbols():
     tests = [
         (
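The expected string in test_replace_punct follows from combining the new replacement with clean()'s defaults: punctuation becomes spaces first, then the text is lower-cased and runs of whitespace are collapsed. A rough plain-Python approximation of those steps (not the library's internals):

    s = "I can't. No, I won't!"
    for ch in ",.'!":            # what replace_punct(s, " ") does to these marks
        s = s.replace(ch, " ")   # "I can t  No  I won t "
    s = s.lower()                # "i can t  no  i won t "
    s = " ".join(s.split())      # "i can t no i won t"  (whitespace normalized)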
@@ -157,5 +163,11 @@ def test_to_ascii():
 def test_whitespace():
     assert cleantext.clean(" peter", normalize_whitespace=False) == " peter"
     assert cleantext.clean(" peter", normalize_whitespace=True) == "peter"
-    assert cleantext.clean(" pet\n\ner", normalize_whitespace=True, no_line_breaks=True) == "pet er"
-    assert cleantext.clean(" pet\n\ner", normalize_whitespace=True, no_line_breaks=False) == "pet\ner"
+    assert (
+        cleantext.clean(" pet\n\ner", normalize_whitespace=True, no_line_breaks=True)
+        == "pet er"
+    )
+    assert (
+        cleantext.clean(" pet\n\ner", normalize_whitespace=True, no_line_breaks=False)
+        == "pet\ner"
+    )
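To run just the new test locally, something like pytest -k test_replace_punct should work, assuming pytest is the project's test runner (the plain-assert style of these tests suggests it is).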