mirror of https://github.com/jfilter/clean-text.git, synced 2021-09-19 22:32:58 +03:00

getting stuff done

17 .editorconfig Normal file
@@ -0,0 +1,17 @@
# http://editorconfig.org

root = true

[*]
indent_style = space
indent_size = 4
insert_final_newline = true
trim_trailing_whitespace = true
end_of_line = lf
charset = utf-8

[*.py]
max_line_length = 119

[*.md]
insert_final_newline = false

1 .gitignore vendored
@@ -102,3 +102,4 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+.vscode

31 LICENSE
@@ -1,21 +1,18 @@
-MIT License
-
-Copyright (c) 2018 Johannes Filter
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+Copyright 2016 Chartbeat, Inc.
+
+Modified by Johannes Filter, 2018
+- only use `constants.py`, `preprocess.py` (renamed to clean.py) and `compat.py`
+- only use `test_preprocess` and renamed to `test_clean.py`
+- modification to the code to add new features
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.

17 Pipfile Normal file
@@ -0,0 +1,17 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]
pylint = "*"
black = "*"

[packages]
cleantext = {editable = true,extras = ["gpl"],path = "."}
ftfy = "*"
unidecode = "*"
pytest = "*"

[requires]
python_version = "3.7"
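
The [packages] entry above installs the repository itself in editable mode with the optional gpl extra, which (per setup.py further down in this commit) pulls in unidecode. A minimal sketch, not part of the diff, of how code can check whether that optional dependency actually ended up in the environment:

# Illustrative check, not from the repository: is the optional "gpl"
# extra (unidecode) importable in the current environment?
import importlib.util

HAS_UNIDECODE = importlib.util.find_spec("unidecode") is not None
print("unidecode available:", HAS_UNIDECODE)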

179 Pipfile.lock generated Normal file
@@ -0,0 +1,179 @@
{
    "_meta": {
        "hash": {
            "sha256": "933b2d6b2d62530ba230ca07596269006bc8c9d645c358603b0ebc8cda215f06"
        },
        "pipfile-spec": 6,
        "requires": {
            "python_version": "3.7"
        },
        "sources": [
            {
                "name": "pypi",
                "url": "https://pypi.org/simple",
                "verify_ssl": true
            }
        ]
    },
    "default": {
        "atomicwrites": {
            "hashes": [
                "sha256:0312ad34fcad8fac3704d441f7b317e50af620823353ec657a53e981f92920c0",
                "sha256:ec9ae8adaae229e4f8446952d204a3e4b5fdd2d099f9be3aaf556120135fb3ee"
            ],
            "version": "==1.2.1"
        },
        "attrs": {
            "hashes": [
                "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69",
                "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb"
            ],
            "version": "==18.2.0"
        },
        "cleantext": {
            "editable": true,
            "extras": [
                "gpl"
            ],
            "path": "."
        },
        "ftfy": {
            "hashes": [
                "sha256:525ea45a871f52ddb170e66b01f35f1b3022995016c81efa305e628937b85443"
            ],
            "index": "pypi",
            "version": "==5.5.0"
        },
        "more-itertools": {
            "hashes": [
                "sha256:c187a73da93e7a8acc0001572aebc7e3c69daf7bf6881a2cea10650bd4420092",
                "sha256:c476b5d3a34e12d40130bc2f935028b5f636df8f372dc2c1c01dc19681b2039e",
                "sha256:fcbfeaea0be121980e15bc97b3817b5202ca73d0eae185b4550cbfce2a3ebb3d"
            ],
            "version": "==4.3.0"
        },
        "pluggy": {
            "hashes": [
                "sha256:447ba94990e8014ee25ec853339faf7b0fc8050cdc3289d4d71f7f410fb90095",
                "sha256:bde19360a8ec4dfd8a20dcb811780a30998101f078fc7ded6162f0076f50508f"
            ],
            "version": "==0.8.0"
        },
        "py": {
            "hashes": [
                "sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694",
                "sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6"
            ],
            "version": "==1.7.0"
        },
        "pytest": {
            "hashes": [
                "sha256:f689bf2fc18c4585403348dd56f47d87780bf217c53ed9ae7a3e2d7faa45f8e9",
                "sha256:f812ea39a0153566be53d88f8de94839db1e8a05352ed8a49525d7d7f37861e9"
            ],
            "index": "pypi",
            "version": "==4.0.2"
        },
        "six": {
            "hashes": [
                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
            ],
            "version": "==1.12.0"
        },
        "unidecode": {
            "hashes": [
                "sha256:092cdf7ad9d1052c50313426a625b717dab52f7ac58f859e09ea020953b1ad8f",
                "sha256:8b85354be8fd0c0e10adbf0675f6dc2310e56fda43fa8fe049123b6c475e52fb"
            ],
            "index": "pypi",
            "version": "==1.0.23"
        },
        "wcwidth": {
            "hashes": [
                "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
                "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
            ],
            "version": "==0.1.7"
        }
    },
    "develop": {
        "astroid": {
            "hashes": [
                "sha256:35b032003d6a863f5dcd7ec11abd5cd5893428beaa31ab164982403bcb311f22",
                "sha256:6a5d668d7dc69110de01cdf7aeec69a679ef486862a0850cc0fd5571505b6b7e"
            ],
            "version": "==2.1.0"
        },
        "isort": {
            "hashes": [
                "sha256:1153601da39a25b14ddc54955dbbacbb6b2d19135386699e2ad58517953b34af",
                "sha256:b9c40e9750f3d77e6e4d441d8b0266cf555e7cdabdcff33c4fd06366ca761ef8",
                "sha256:ec9ef8f4a9bc6f71eec99e1806bfa2de401650d996c59330782b89a5555c1497"
            ],
            "version": "==4.3.4"
        },
        "lazy-object-proxy": {
            "hashes": [
                "sha256:0ce34342b419bd8f018e6666bfef729aec3edf62345a53b537a4dcc115746a33",
                "sha256:1b668120716eb7ee21d8a38815e5eb3bb8211117d9a90b0f8e21722c0758cc39",
                "sha256:209615b0fe4624d79e50220ce3310ca1a9445fd8e6d3572a896e7f9146bbf019",
                "sha256:27bf62cb2b1a2068d443ff7097ee33393f8483b570b475db8ebf7e1cba64f088",
                "sha256:27ea6fd1c02dcc78172a82fc37fcc0992a94e4cecf53cb6d73f11749825bd98b",
                "sha256:2c1b21b44ac9beb0fc848d3993924147ba45c4ebc24be19825e57aabbe74a99e",
                "sha256:2df72ab12046a3496a92476020a1a0abf78b2a7db9ff4dc2036b8dd980203ae6",
                "sha256:320ffd3de9699d3892048baee45ebfbbf9388a7d65d832d7e580243ade426d2b",
                "sha256:50e3b9a464d5d08cc5227413db0d1c4707b6172e4d4d915c1c70e4de0bbff1f5",
                "sha256:5276db7ff62bb7b52f77f1f51ed58850e315154249aceb42e7f4c611f0f847ff",
                "sha256:61a6cf00dcb1a7f0c773ed4acc509cb636af2d6337a08f362413c76b2b47a8dd",
                "sha256:6ae6c4cb59f199d8827c5a07546b2ab7e85d262acaccaacd49b62f53f7c456f7",
                "sha256:7661d401d60d8bf15bb5da39e4dd72f5d764c5aff5a86ef52a042506e3e970ff",
                "sha256:7bd527f36a605c914efca5d3d014170b2cb184723e423d26b1fb2fd9108e264d",
                "sha256:7cb54db3535c8686ea12e9535eb087d32421184eacc6939ef15ef50f83a5e7e2",
                "sha256:7f3a2d740291f7f2c111d86a1c4851b70fb000a6c8883a59660d95ad57b9df35",
                "sha256:81304b7d8e9c824d058087dcb89144842c8e0dea6d281c031f59f0acf66963d4",
                "sha256:933947e8b4fbe617a51528b09851685138b49d511af0b6c0da2539115d6d4514",
                "sha256:94223d7f060301b3a8c09c9b3bc3294b56b2188e7d8179c762a1cda72c979252",
                "sha256:ab3ca49afcb47058393b0122428358d2fbe0408cf99f1b58b295cfeb4ed39109",
                "sha256:bd6292f565ca46dee4e737ebcc20742e3b5be2b01556dafe169f6c65d088875f",
                "sha256:cb924aa3e4a3fb644d0c463cad5bc2572649a6a3f68a7f8e4fbe44aaa6d77e4c",
                "sha256:d0fc7a286feac9077ec52a927fc9fe8fe2fabab95426722be4c953c9a8bede92",
                "sha256:ddc34786490a6e4ec0a855d401034cbd1242ef186c20d79d2166d6a4bd449577",
                "sha256:e34b155e36fa9da7e1b7c738ed7767fc9491a62ec6af70fe9da4a057759edc2d",
                "sha256:e5b9e8f6bda48460b7b143c3821b21b452cb3a835e6bbd5dd33aa0c8d3f5137d",
                "sha256:e81ebf6c5ee9684be8f2c87563880f93eedd56dd2b6146d8a725b50b7e5adb0f",
                "sha256:eb91be369f945f10d3a49f5f9be8b3d0b93a4c2be8f8a5b83b0571b8123e0a7a",
                "sha256:f460d1ceb0e4a5dcb2a652db0904224f367c9b3c1470d5a7683c0480e582468b"
            ],
            "version": "==1.3.1"
        },
        "mccabe": {
            "hashes": [
                "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
                "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
            ],
            "version": "==0.6.1"
        },
        "pylint": {
            "hashes": [
                "sha256:689de29ae747642ab230c6d37be2b969bf75663176658851f456619aacf27492",
                "sha256:771467c434d0d9f081741fec1d64dfb011ed26e65e12a28fe06ca2f61c4d556c"
            ],
            "index": "pypi",
            "version": "==2.2.2"
        },
        "six": {
            "hashes": [
                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
            ],
            "version": "==1.12.0"
        },
        "wrapt": {
            "hashes": [
                "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6"
            ],
            "version": "==1.10.11"
        }
    }
}

1 cleantext/__init__.py Normal file
@@ -0,0 +1 @@
from .clean import *
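
Because __init__.py star-imports clean.py, everything defined there is exposed directly on the cleantext package. A short usage sketch (example strings are mine, not from the diff):

# Assumes the package is installed, e.g. in editable mode via the Pipfile above.
import cleantext

print(cleantext.normalize_whitespace("plain\n\n\ntext"))  # -> "plain\ntext"
print(cleantext.zero_digits("version 1.2.3"))  # -> "version 0.0.0"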

304 cleantext/clean.py Normal file
@@ -0,0 +1,304 @@
"""

"""

import re
import unicodedata

from ftfy import fix_text

from . import constants

# fall back to `unicodedata`
try:
    from unidecode import unidecode
except:
    pass


strange_double_quotes = [
    "«",
    "‹",
    "»",
    "›",
    "„",
    "“",
    "‟",
    "”",
    "❝",
    "❞",
    "❮",
    "❯",
    "〝",
    "〞",
    "〟",
    "＂",
]
strange_single_quotes = ["‘", "‛", "’", "❛", "❜", "`", "´", '‘', '’']


def fix_strange_quotes(text):
    text = str(text)
    for q in strange_double_quotes:
        text = text.replace(q, '"')
    for q in strange_single_quotes:
        text = text.replace(q, "'")
    return text


def fix_bad_unicode(text, normalization="NFC"):
    """
    Fix unicode text that's "broken" using `ftfy <http://ftfy.readthedocs.org/>`_;
    this includes mojibake, HTML entities and other code cruft,
    and non-standard forms for display purposes.
    Args:
        text (str): raw text
        normalization ({'NFC', 'NFKC', 'NFD', 'NFKD'}): if 'NFC',
            combines characters and diacritics written using separate code points,
            e.g. converting "e" plus an acute accent modifier into "é"; unicode
            can be converted to NFC form without any change in its meaning!
            if 'NFKC', additional normalizations are applied that can change
            the meanings of characters, e.g. ellipsis characters will be replaced
            with three periods
    Returns:
        str
    """
    # fix if the unicode is fucked up
    text = text.encode().decode("unicode-escape")

    # normalize quotes before
    text = fix_strange_quotes(text)

    return fix_text(text, normalization=normalization)


def ascii_unicode(text):
    """
    Try to represent unicode data in ascii characters similar to what a human
    with a US keyboard would choose.
    Works great for languages of Western origin, worse the farther the language
    gets from Latin-based alphabets. It's based on hand-tuned character mappings
    that also contain ascii approximations for symbols and non-Latin alphabets.
    """
    return unidecode(text)


def normalize_whitespace(text):
    """
    Given ``text`` str, replace one or more spacings with a single space, and one
    or more linebreaks with a single newline. Also strip leading/trailing whitespace.
    """
    return constants.NONBREAKING_SPACE_REGEX.sub(
        " ", constants.LINEBREAK_REGEX.sub(r"\n", text)
    ).strip()


def unpack_contractions(text):
    """
    Replace *English* contractions in ``text`` str with their unshortened forms.
    N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive),
    so are left as-is.
    """
    # standard
    text = re.sub(
        r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n't",
        r"\1\2 not",
        text,
    )
    text = re.sub(
        r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll",
        r"\1\2 will",
        text,
    )
    text = re.sub(r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re", r"\1\2 are", text)
    text = re.sub(
        r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)'ve",
        r"\1\2 have",
        text,
    )
    # non-standard
    text = re.sub(r"(\b)([Cc]a)n't", r"\1\2n not", text)
    text = re.sub(r"(\b)([Ii])'m", r"\1\2 am", text)
    text = re.sub(r"(\b)([Ll]et)'s", r"\1\2 us", text)
    text = re.sub(r"(\b)([Ww])on't", r"\1\2ill not", text)
    text = re.sub(r"(\b)([Ss])han't", r"\1\2hall not", text)
    text = re.sub(r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all", text)
    return text


def replace_urls(text, replace_with="<URL>"):
    """Replace all URLs in ``text`` str with ``replace_with`` str."""
    return constants.URL_REGEX.sub(
        replace_with, constants.SHORT_URL_REGEX.sub(replace_with, text)
    )


def replace_emails(text, replace_with="<EMAIL>"):
    """Replace all emails in ``text`` str with ``replace_with`` str."""
    return constants.EMAIL_REGEX.sub(replace_with, text)


def replace_phone_numbers(text, replace_with="<PHONE>"):
    """Replace all phone numbers in ``text`` str with ``replace_with`` str."""
    return constants.PHONE_REGEX.sub(replace_with, text)


def replace_numbers(text, replace_with="<NUMBER>"):
    """Replace all numbers in ``text`` str with ``replace_with`` str."""
    return constants.NUMBERS_REGEX.sub(replace_with, text)


def replace_currency_symbols(text, replace_with=None):
    """
    Replace all currency symbols in ``text`` str with string specified by ``replace_with`` str.
    Args:
        text (str): raw text
        replace_with (str): if None (default), replace symbols with
            their standard 3-letter abbreviations (e.g. '$' with 'USD', '£' with 'GBP');
            otherwise, pass in a string with which to replace all symbols
            (e.g. "*CURRENCY*")
    Returns:
        str
    """
    if replace_with is None:
        for k, v in constants.CURRENCIES.items():
            text = text.replace(k, v)
        return text
    else:
        return constants.CURRENCY_REGEX.sub(replace_with, text)


def remove_punct(text, marks=None):
    """
    Remove punctuation from ``text`` by replacing all instances of ``marks``
    with whitespace.
    Args:
        text (str): raw text
        marks (str): If specified, remove only the characters in this string,
            e.g. ``marks=',;:'`` removes commas, semi-colons, and colons.
            Otherwise, all punctuation marks are removed.
    Returns:
        str
    Note:
        When ``marks=None``, Python's built-in :meth:`str.translate()` is
        used to remove punctuation; otherwise, a regular expression is used
        instead. The former's performance is about 5-10x faster.
    """
    if marks:
        return re.sub("[{}]+".format(re.escape(marks)), " ", text, flags=re.UNICODE)
    else:
        return text.translate(constants.PUNCT_TRANSLATE_UNICODE)


def remove_accents(text, method="unicode"):
    """
    Remove accents from any accented unicode characters in ``text`` str, either by
    transforming them into ascii equivalents or removing them entirely.
    Args:
        text (str): raw text
        method ({'unicode', 'ascii'}): if 'unicode', remove accented
            char for any unicode symbol with a direct ASCII equivalent; if 'ascii',
            remove accented char for any unicode symbol
            NB: the 'ascii' method is notably faster than 'unicode', but less good
    Returns:
        str
    Raises:
        ValueError: if ``method`` is not in {'unicode', 'ascii'}
    """
    if method == "unicode":
        return "".join(
            c
            for c in unicodedata.normalize("NFKD", text)
            if not unicodedata.combining(c)
        )
    elif method == "ascii":
        return (
            unicodedata.normalize("NFKD", text)
            .encode("ascii", errors="ignore")
            .decode("ascii")
        )
    else:
        msg = '`method` must be either "unicode" and "ascii", not {}'.format(method)
        raise ValueError(msg)


def zero_digits(text):
    """
    All digits are reduced to 0. 123.34 to 000.00
    """
    return re.sub(r"\d", "0", text)


def clean(
    text,
    fix_unicode=False,
    lower=False,
    ascii=False,
    no_urls=False,
    no_emails=False,
    no_phone_numbers=False,
    no_numbers=False,
    no_currency_symbols=False,
    no_punct=False,
    no_contractions=False,
    no_accents=False,
):
    """
    Normalize various aspects of a raw text doc before parsing it with Spacy.
    A convenience function for applying all other preprocessing functions in one go.
    Args:
        text (str): raw text to preprocess
        fix_unicode (bool): if True, fix "broken" unicode such as
            mojibake and garbled HTML entities
        lower (bool): if True, all text is lower-cased
        ascii (bool): if True, convert non-ascii characters
            into their closest ascii equivalents
        no_urls (bool): if True, replace all URL strings with '*URL*'
        no_emails (bool): if True, replace all email strings with '*EMAIL*'
        no_phone_numbers (bool): if True, replace all phone number strings
            with '*PHONE*'
        no_numbers (bool): if True, replace all number-like strings
            with '*NUMBER*'
        no_currency_symbols (bool): if True, replace all currency symbols
            with their standard 3-letter abbreviations
        no_punct (bool): if True, remove all punctuation (replace with
            empty string)
        no_contractions (bool): if True, replace *English* contractions
            with their unshortened forms
        no_accents (bool): if True, replace all accented characters
            with unaccented versions; NB: if `ascii` is True, this option
            is redundant
    Returns:
        str: input ``text`` processed according to function args
    Warning:
        These changes may negatively affect subsequent NLP analysis performed
        on the text, so choose carefully, and preprocess at your own risk!
    """
    if fix_unicode is True:
        text = fix_bad_unicode(text, normalization="NFC")
    if ascii is True:
        text = ascii_unicode(text)
    if no_urls is True:
        text = replace_urls(text)
    if no_emails is True:
        text = replace_emails(text)
    if no_phone_numbers is True:
        text = replace_phone_numbers(text)
    if no_numbers is True:
        text = replace_numbers(text)
    if no_currency_symbols is True:
        text = replace_currency_symbols(text)
    if no_contractions is True:
        text = unpack_contractions(text)
    if no_accents is True:
        text = remove_accents(text, method="unicode")
    if no_punct is True:
        text = remove_punct(text)
    if lower is True:
        text = text.lower()
    # always normalize whitespace; treat linebreaks separately from spacing
    text = normalize_whitespace(text)

    return text
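
clean() above simply chains the individual helpers behind boolean flags. A brief usage sketch of that function; the flag comments describe the defaults in the code above, and the sample sentence is mine, not from the diff:

# Example call, not part of the commit; the replacement tokens are the
# defaults of the replace_* helpers defined above.
import cleantext

raw = "Visit https://example.com or write to person@example.com - it costs $25!"
cleaned = cleantext.clean(
    raw,
    no_urls=True,              # URLs -> "<URL>"
    no_emails=True,            # e-mail addresses -> "<EMAIL>"
    no_numbers=True,           # number-like strings -> "<NUMBER>"
    no_currency_symbols=True,  # "$" -> "USD"
)
print(cleaned)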

52 cleantext/compat.py Normal file
@@ -0,0 +1,52 @@
"""
"""

from __future__ import print_function

import sys

is_python2 = int(sys.version[0]) == 2
is_windows = sys.platform.startswith("win")
is_linux = sys.platform.startswith("linux")
is_osx = sys.platform == "darwin"

if is_python2:
    import cPickle as pickle
    from backports import csv
    from itertools import izip as zip_
    from urlparse import urljoin

    range_ = xrange

    bytes_ = str
    unicode_ = unicode
    string_types = (str, unicode)
    int_types = (int, long)
    chr_ = unichr

    def unicode_to_bytes(s, encoding="utf8", errors="strict"):
        return s.encode(encoding=encoding, errors=errors)

    def bytes_to_unicode(b, encoding="utf8", errors="strict"):
        return unicode_(b, encoding=encoding, errors=errors)


else:
    import csv
    import pickle
    from builtins import zip as zip_
    from urllib.parse import urljoin

    range_ = range

    bytes_ = bytes
    unicode_ = str
    string_types = (bytes, str)
    int_types = (int,)
    chr_ = chr

    def unicode_to_bytes(s, encoding="utf8", errors="strict"):
        return s.encode(encoding=encoding, errors=errors)

    def bytes_to_unicode(b, encoding="utf8", errors="strict"):
        return b.decode(encoding=encoding, errors=errors)
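
compat.py only papers over the Python 2/3 split; constants.py below is the main consumer of range_ and chr_. A tiny illustration of those aliases (my example, not from the diff), mirroring how PUNCT_TRANSLATE_UNICODE is built:

# On Python 3 these aliases are simply chr/range; on Python 2 they would be
# unichr/xrange. Find the first codepoint whose category starts with "P".
import unicodedata

from cleantext import compat

first_punct = next(
    i
    for i in compat.range_(200)
    if unicodedata.category(compat.chr_(i)).startswith("P")
)
print(first_punct, compat.chr_(first_punct))  # 33 !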

111 cleantext/constants.py Normal file
@@ -0,0 +1,111 @@
"""

"""

import re
import sys
import unicodedata

from . import compat

CURRENCIES = {
    "$": "USD",
    "zł": "PLN",
    "£": "GBP",
    "¥": "JPY",
    "฿": "THB",
    "₡": "CRC",
    "₦": "NGN",
    "₩": "KRW",
    "₪": "ILS",
    "₫": "VND",
    "€": "EUR",
    "₱": "PHP",
    "₲": "PYG",
    "₴": "UAH",
    "₹": "INR",
}


PUNCT_TRANSLATE_UNICODE = dict.fromkeys(
    (
        i
        for i in compat.range_(sys.maxunicode)
        if unicodedata.category(compat.chr_(i)).startswith("P")
    ),
    u" ",
)

ACRONYM_REGEX = re.compile(
    r"(?:^|(?<=\W))(?:(?:(?:(?:[A-Z]\.?)+[a-z0-9&/-]?)+(?:[A-Z][s.]?|[0-9]s?))|(?:[0-9](?:\-?[A-Z])+))(?:$|(?=\W))",
    flags=re.UNICODE,
)
EMAIL_REGEX = re.compile(
    r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(\.([a-z]{2,})){1,3}(?:$|(?=\b))",
    flags=re.IGNORECASE | re.UNICODE,
)
PHONE_REGEX = re.compile(
    r"(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W))"
)
NUMBERS_REGEX = re.compile(
    r"(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|(\d*?[.,]\d+)|\d+)(?:$|(?=\b))"
)
CURRENCY_REGEX = re.compile(
    "({})+".format("|".join(re.escape(c) for c in CURRENCIES.keys()))
)
LINEBREAK_REGEX = re.compile(r"((\r\n)|[\n\v])+")
NONBREAKING_SPACE_REGEX = re.compile(r"(?!\n)\s+")
URL_REGEX = re.compile(
    r"(?:^|(?<![\w/.]))"
    # protocol identifier
    # r"(?:(?:https?|ftp)://)"  <-- alt?
    r"(?:(?:https?://|ftp://|www\d{0,3}\.))"
    # user:pass authentication
    r"(?:\S+(?::\S*)?@)?" r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host name
    r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
    # domain name
    r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
    # TLD identifier
    r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:/\S*)?" r"(?:$|(?![\w?!+&/]))",
    flags=re.UNICODE | re.IGNORECASE,
)  # source: https://gist.github.com/dperini/729294

SHORT_URL_REGEX = re.compile(
    r"(?:^|(?<![\w/.]))"
    # optional scheme
    r"(?:(?:https?://)?)"
    # domain
    r"(?:\w-?)*?\w+(?:\.[a-z]{2,12}){1,3}" r"/"
    # hash
    r"[^\s.,?!'\"|+]{2,12}" r"(?:$|(?![\w?!+&/]))",
    flags=re.IGNORECASE,
)

# regexes for cleaning up crufty terms
DANGLING_PARENS_TERM_RE = re.compile(
    r"(?:\s|^)(\()\s{1,2}(.*?)\s{1,2}(\))(?:\s|$)", flags=re.UNICODE
)
LEAD_TAIL_CRUFT_TERM_RE = re.compile(r"^([^\w(-] ?)+|([^\w).!?] ?)+$", flags=re.UNICODE)
LEAD_HYPHEN_TERM_RE = re.compile(r"^-([^\W\d_])", flags=re.UNICODE)
NEG_DIGIT_TERM_RE = re.compile(r"(-) (\d)", flags=re.UNICODE)
WEIRD_HYPHEN_SPACE_TERM_RE = re.compile(r"(?<=[^\W\d]) (-[^\W\d])", flags=re.UNICODE)
WEIRD_APOSTR_SPACE_TERM_RE = re.compile(r"([^\W\d]+) ('[a-z]{1,2}\b)", flags=re.UNICODE)
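
The constants above are ordinary compiled re patterns and can be exercised directly; a quick sketch with example strings of my own, not taken from the repository:

# Informal spot checks of the patterns defined above.
from cleantext import constants

print(constants.EMAIL_REGEX.sub("<EMAIL>", "write to jane.doe@example.org today"))
print(constants.URL_REGEX.sub("<URL>", "docs live at https://example.com/docs now"))
print(constants.CURRENCIES["€"])  # EUR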

28 setup.py Normal file
@@ -0,0 +1,28 @@
from setuptools import setup
from setuptools import find_packages

with open("README.md", "r") as fh:
    long_description = fh.read()

classifiers = [
    'Programming Language :: Python :: 3.5',
    'Programming Language :: Python :: 3.6',
    'License :: OSI Approved :: MIT License',
]

version = '0.0.0'

setup(name='cleantext',
      version=version,
      description='Clean your dirty text',
      long_description=long_description,
      long_description_content_type="text/markdown",
      author='Johannes Filter',
      author_email='ragha@outlook.com, hi@jfilter.de',
      url='https://github.com/jfilter/clean-text',
      license='MIT',
      install_requires=['ftfy'],
      extras_require={'gpl': ['unidecode']},
      include_package_data=True,
      classifiers=classifiers,
      packages=find_packages())

88 tests/test_clean.py Normal file
@@ -0,0 +1,88 @@
import pytest

import cleantext


def test_normalize_whitespace():
    text = "Hello, world! Hello...\t \tworld?\n\nHello:\r\n\n\nWorld. "
    proc_text = "Hello, world! Hello... world?\nHello:\nWorld."
    assert cleantext.normalize_whitespace(text) == proc_text


def test_unpack_contractions():
    text = "Y'all can't believe you're not who they've said I'll become, but shouldn't."
    proc_text = "You all can not believe you are not who they have said I will become, but should not."
    assert cleantext.unpack_contractions(text) == proc_text


def test_replace_urls():
    text = "I learned everything I know from www.stackoverflow.com and http://wikipedia.org/ and Mom."
    proc_text = "I learned everything I know from *URL* and *URL* and Mom."
    assert cleantext.replace_urls(text, "*URL*") == proc_text


def test_replace_emails():
    text = "I can be reached at username@example.com through next Friday."
    proc_text = "I can be reached at *EMAIL* through next Friday."
    assert cleantext.replace_emails(text, "*EMAIL*") == proc_text


def test_replace_phone_numbers():
    text = "I can be reached at 555-123-4567 through next Friday."
    proc_text = "I can be reached at *PHONE* through next Friday."
    assert cleantext.replace_phone_numbers(text, "*PHONE*") == proc_text


def test_replace_numbers():
    text = "I owe $1,000.99 to 123 people for 2 +1 reasons."
    proc_text = "I owe $*NUM* to *NUM* people for *NUM* *NUM* reasons."
    assert cleantext.replace_numbers(text, "*NUM*") == proc_text


def test_remove_punct():
    text = "I can't. No, I won't! It's a matter of \"principle\"; of -- what's the word? -- conscience."
    proc_text = "I can t No I won t It s a matter of principle of what s the word conscience "
    assert cleantext.remove_punct(text) == proc_text


def test_remove_punct_marks():
    text = "I can't. No, I won't! It's a matter of \"principle\"; of -- what's the word? -- conscience."
    proc_text = "I can t. No, I won t! It s a matter of principle ; of what s the word? conscience."
    assert cleantext.remove_punct(text, marks="-'\"") == proc_text


def test_replace_currency_symbols():
    tests = [
        (
            "$1.00 equals £0.67 equals €0.91.",
            "USD1.00 equals GBP0.67 equals EUR0.91.",
            "*CUR* 1.00 equals *CUR* 0.67 equals *CUR* 0.91.",
        ),
        (
            "this zebra costs $100.",
            "this zebra costs USD100.",
            "this zebra costs *CUR* 100.",
        ),
    ]
    for text, proc_text1, proc_text2 in tests:
        assert cleantext.replace_currency_symbols(text, replace_with=None) == proc_text1
        assert (
            cleantext.replace_currency_symbols(text, replace_with="*CUR* ")
            == proc_text2
        )


def test_remove_accents():
    text = "El niño se asustó -- qué miedo!"
    proc_text = "El nino se asusto -- que miedo!"
    assert cleantext.remove_accents(text, method="unicode") == proc_text
    assert cleantext.remove_accents(text, method="ascii") == proc_text
    with pytest.raises(Exception):
        _ = cleantext.remove_accents(text, method="foo")


def test_fix_unicode():
    text = (
        "and install a \\u2018new\\u2019 society in their"
    )  # and install a ‘new’ society in their
    assert cleantext.fix_bad_unicode(text) == "and install a 'new' society in their"
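
zero_digits() from clean.py is the one helper without a test in this commit; a minimal sketch of one in the same style as the tests above (my addition, not part of the diff):

def test_zero_digits():
    text = "Call 555-123-4567 about invoice 42, total 123.34."
    proc_text = "Call 000-000-0000 about invoice 00, total 000.00."
    assert cleantext.zero_digits(text) == proc_text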