mirror of
https://github.com/jfilter/clean-text.git
synced 2021-09-19 22:32:58 +03:00
minor improvements
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
# `clean-text` [](https://travis-ci.com/jfilter/clean-text) [](https://pypi.org/project/clean-text/) [](https://pypi.org/project/clean-text/)
|
||||
# `clean-text` [](https://travis-ci.com/jfilter/clean-text) [](https://pypi.org/project/clean-text/) [](https://pypi.org/project/clean-text/) [](https://pypistats.org/packages/clean-text)
|
||||
|
||||
Data on the Web and elsewhere is often dirty. Clean your text with `clean-text` to create normalized text representations. For instance, turn this corrupted input:
|
||||
User-generated content on the Web and in social media is often dirty. Preprocess your scraped data with `clean-text` to create a normalized text representation. For instance, turn this corrupted input:
|
||||
|
||||
```txt
|
||||
A bunch of \\u2018new\\u2019 references, including [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29).
|
||||
@@ -88,6 +88,9 @@ If you don't like the output of `clean-text`, consider adding a [test](https://g
|
||||
|
||||
- https://github.com/pudo/normality
|
||||
- https://github.com/davidmogar/cucco
|
||||
- https://github.com/lyeoni/prenlp
|
||||
- https://github.com/chartbeat-labs/textacy
|
||||
- https://github.com/jbesomi/texthero
|
||||
|
||||
## Acknowledgements
|
||||
|
||||
@@ -99,6 +102,6 @@ Apache
|
||||
|
||||
## Sponsoring
|
||||
|
||||
This work was created as part of a [project](https://github.com/jfilter/ptf) that was funded by the German [Federal Ministry of Education and Research](https://www.bmbf.de/en/index.html).
|
||||
This work was created as part of a [project](https://github.com/jfilter/ptf-kommentare) that was funded by the German [Federal Ministry of Education and Research](https://www.bmbf.de/en/index.html).
|
||||
|
||||
<img src="./bmbf_funded.svg">
|
||||
|
||||
@@ -4,7 +4,6 @@ Clean your text to create normalized text representations.
|
||||
|
||||
import logging
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
from ftfy import fix_text
|
||||
|
||||
@@ -212,7 +211,7 @@ def clean(
|
||||
"""
|
||||
|
||||
if text is None:
|
||||
return ''
|
||||
return ""
|
||||
|
||||
text = str(text)
|
||||
|
||||
|
||||
@@ -9,6 +9,8 @@ classifiers = [
|
||||
'Programming Language :: Python :: 3.5',
|
||||
'Programming Language :: Python :: 3.6',
|
||||
'Programming Language :: Python :: 3.7',
|
||||
'Programming Language :: Python :: 3.8',
|
||||
'Programming Language :: Python :: 3.9',
|
||||
'License :: OSI Approved :: Apache Software License',
|
||||
]
|
||||
packages = [
|
||||
|
||||
@@ -74,9 +74,7 @@ def test_replace_currency_symbols():
|
||||
|
||||
|
||||
def test_fix_bad_unicode():
|
||||
text = (
|
||||
"and install a \\u2018new\\u2019 society in their"
|
||||
) # and install a ‘new’ society in their
|
||||
text = "and install a \\u2018new\\u2019 society in their" # and install a ‘new’ society in their
|
||||
assert cleantext.fix_bad_unicode(text) == "and install a 'new' society in their"
|
||||
|
||||
|
||||
@@ -97,4 +95,5 @@ def test_zero_digits():
|
||||
def test_to_ascii():
|
||||
assert cleantext.to_ascii_unicode("whatéver") == "whatever"
|
||||
assert cleantext.to_ascii_unicode("Äpfel»", lang="de") == 'Äpfel"'
|
||||
assert cleantext.to_ascii_unicode("Äpfel»", lang="DE") == 'Äpfel"'
|
||||
|
||||
|
||||
Reference in New Issue
Block a user