From d0c1eb6077d3b42d3c48232765c276ffe9f04daa Mon Sep 17 00:00:00 2001 From: Johannes Filter Date: Fri, 21 Dec 2018 21:18:08 +0100 Subject: [PATCH] getting stuff done --- .editorconfig | 17 +++ .gitignore | 1 + LICENSE | 31 ++--- Pipfile | 17 +++ Pipfile.lock | 179 ++++++++++++++++++++++++ cleantext/__init__.py | 1 + cleantext/clean.py | 304 +++++++++++++++++++++++++++++++++++++++++ cleantext/compat.py | 52 +++++++ cleantext/constants.py | 111 +++++++++++++++ setup.py | 28 ++++ tests/test_clean.py | 88 ++++++++++++ 11 files changed, 812 insertions(+), 17 deletions(-) create mode 100644 .editorconfig create mode 100644 Pipfile create mode 100644 Pipfile.lock create mode 100644 cleantext/__init__.py create mode 100644 cleantext/clean.py create mode 100644 cleantext/compat.py create mode 100644 cleantext/constants.py create mode 100644 setup.py create mode 100644 tests/test_clean.py diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..5683ee0 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,17 @@ +# http://editorconfig.org + +root = true + +[*] +indent_style = space +indent_size = 4 +insert_final_newline = true +trim_trailing_whitespace = true +end_of_line = lf +charset = utf-8 + +[*.py] +max_line_length = 119 + +[*.md] +insert_final_newline = false diff --git a/.gitignore b/.gitignore index 894a44c..7d3392b 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,4 @@ venv.bak/ # mypy .mypy_cache/ +.vscode diff --git a/LICENSE b/LICENSE index b9148f5..866c37d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,18 @@ -MIT License +Copyright 2016 Chartbeat, Inc. 
-Copyright (c) 2018 Johannes Filter +Modified by Johannes Filter, 2018 +- only use `constants.py`, `preprocess.py` (renamed to clean.py) and `compat.py` +- only use `test_preprocess` and renamed to `test_clean.py` +- modification to the code to add new features -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. + http://www.apache.org/licenses/LICENSE-2.0 -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..2c92e7a --- /dev/null +++ b/Pipfile @@ -0,0 +1,17 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] +pylint = "*" +black = "*" + +[packages] +cleantext = {editable = true,extras = ["gpl"],path = "."} +ftfy = "*" +unidecode = "*" +pytest = "*" + +[requires] +python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..ebe33ca --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,179 @@ +{ + "_meta": { + "hash": { + "sha256": "933b2d6b2d62530ba230ca07596269006bc8c9d645c358603b0ebc8cda215f06" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.7" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "atomicwrites": { + "hashes": [ + "sha256:0312ad34fcad8fac3704d441f7b317e50af620823353ec657a53e981f92920c0", + "sha256:ec9ae8adaae229e4f8446952d204a3e4b5fdd2d099f9be3aaf556120135fb3ee" + ], + "version": "==1.2.1" + }, + "attrs": { + "hashes": [ + "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69", + "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb" + ], + "version": "==18.2.0" + }, + "cleantext": { + "editable": true, + "extras": [ + "gpl" + ], + "path": "." 
+ }, + "ftfy": { + "hashes": [ + "sha256:525ea45a871f52ddb170e66b01f35f1b3022995016c81efa305e628937b85443" + ], + "index": "pypi", + "version": "==5.5.0" + }, + "more-itertools": { + "hashes": [ + "sha256:c187a73da93e7a8acc0001572aebc7e3c69daf7bf6881a2cea10650bd4420092", + "sha256:c476b5d3a34e12d40130bc2f935028b5f636df8f372dc2c1c01dc19681b2039e", + "sha256:fcbfeaea0be121980e15bc97b3817b5202ca73d0eae185b4550cbfce2a3ebb3d" + ], + "version": "==4.3.0" + }, + "pluggy": { + "hashes": [ + "sha256:447ba94990e8014ee25ec853339faf7b0fc8050cdc3289d4d71f7f410fb90095", + "sha256:bde19360a8ec4dfd8a20dcb811780a30998101f078fc7ded6162f0076f50508f" + ], + "version": "==0.8.0" + }, + "py": { + "hashes": [ + "sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694", + "sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6" + ], + "version": "==1.7.0" + }, + "pytest": { + "hashes": [ + "sha256:f689bf2fc18c4585403348dd56f47d87780bf217c53ed9ae7a3e2d7faa45f8e9", + "sha256:f812ea39a0153566be53d88f8de94839db1e8a05352ed8a49525d7d7f37861e9" + ], + "index": "pypi", + "version": "==4.0.2" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + }, + "unidecode": { + "hashes": [ + "sha256:092cdf7ad9d1052c50313426a625b717dab52f7ac58f859e09ea020953b1ad8f", + "sha256:8b85354be8fd0c0e10adbf0675f6dc2310e56fda43fa8fe049123b6c475e52fb" + ], + "index": "pypi", + "version": "==1.0.23" + }, + "wcwidth": { + "hashes": [ + "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", + "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + ], + "version": "==0.1.7" + } + }, + "develop": { + "astroid": { + "hashes": [ + "sha256:35b032003d6a863f5dcd7ec11abd5cd5893428beaa31ab164982403bcb311f22", + "sha256:6a5d668d7dc69110de01cdf7aeec69a679ef486862a0850cc0fd5571505b6b7e" + ], + "version": 
"==2.1.0" + }, + "isort": { + "hashes": [ + "sha256:1153601da39a25b14ddc54955dbbacbb6b2d19135386699e2ad58517953b34af", + "sha256:b9c40e9750f3d77e6e4d441d8b0266cf555e7cdabdcff33c4fd06366ca761ef8", + "sha256:ec9ef8f4a9bc6f71eec99e1806bfa2de401650d996c59330782b89a5555c1497" + ], + "version": "==4.3.4" + }, + "lazy-object-proxy": { + "hashes": [ + "sha256:0ce34342b419bd8f018e6666bfef729aec3edf62345a53b537a4dcc115746a33", + "sha256:1b668120716eb7ee21d8a38815e5eb3bb8211117d9a90b0f8e21722c0758cc39", + "sha256:209615b0fe4624d79e50220ce3310ca1a9445fd8e6d3572a896e7f9146bbf019", + "sha256:27bf62cb2b1a2068d443ff7097ee33393f8483b570b475db8ebf7e1cba64f088", + "sha256:27ea6fd1c02dcc78172a82fc37fcc0992a94e4cecf53cb6d73f11749825bd98b", + "sha256:2c1b21b44ac9beb0fc848d3993924147ba45c4ebc24be19825e57aabbe74a99e", + "sha256:2df72ab12046a3496a92476020a1a0abf78b2a7db9ff4dc2036b8dd980203ae6", + "sha256:320ffd3de9699d3892048baee45ebfbbf9388a7d65d832d7e580243ade426d2b", + "sha256:50e3b9a464d5d08cc5227413db0d1c4707b6172e4d4d915c1c70e4de0bbff1f5", + "sha256:5276db7ff62bb7b52f77f1f51ed58850e315154249aceb42e7f4c611f0f847ff", + "sha256:61a6cf00dcb1a7f0c773ed4acc509cb636af2d6337a08f362413c76b2b47a8dd", + "sha256:6ae6c4cb59f199d8827c5a07546b2ab7e85d262acaccaacd49b62f53f7c456f7", + "sha256:7661d401d60d8bf15bb5da39e4dd72f5d764c5aff5a86ef52a042506e3e970ff", + "sha256:7bd527f36a605c914efca5d3d014170b2cb184723e423d26b1fb2fd9108e264d", + "sha256:7cb54db3535c8686ea12e9535eb087d32421184eacc6939ef15ef50f83a5e7e2", + "sha256:7f3a2d740291f7f2c111d86a1c4851b70fb000a6c8883a59660d95ad57b9df35", + "sha256:81304b7d8e9c824d058087dcb89144842c8e0dea6d281c031f59f0acf66963d4", + "sha256:933947e8b4fbe617a51528b09851685138b49d511af0b6c0da2539115d6d4514", + "sha256:94223d7f060301b3a8c09c9b3bc3294b56b2188e7d8179c762a1cda72c979252", + "sha256:ab3ca49afcb47058393b0122428358d2fbe0408cf99f1b58b295cfeb4ed39109", + "sha256:bd6292f565ca46dee4e737ebcc20742e3b5be2b01556dafe169f6c65d088875f", + 
"sha256:cb924aa3e4a3fb644d0c463cad5bc2572649a6a3f68a7f8e4fbe44aaa6d77e4c", + "sha256:d0fc7a286feac9077ec52a927fc9fe8fe2fabab95426722be4c953c9a8bede92", + "sha256:ddc34786490a6e4ec0a855d401034cbd1242ef186c20d79d2166d6a4bd449577", + "sha256:e34b155e36fa9da7e1b7c738ed7767fc9491a62ec6af70fe9da4a057759edc2d", + "sha256:e5b9e8f6bda48460b7b143c3821b21b452cb3a835e6bbd5dd33aa0c8d3f5137d", + "sha256:e81ebf6c5ee9684be8f2c87563880f93eedd56dd2b6146d8a725b50b7e5adb0f", + "sha256:eb91be369f945f10d3a49f5f9be8b3d0b93a4c2be8f8a5b83b0571b8123e0a7a", + "sha256:f460d1ceb0e4a5dcb2a652db0904224f367c9b3c1470d5a7683c0480e582468b" + ], + "version": "==1.3.1" + }, + "mccabe": { + "hashes": [ + "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", + "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f" + ], + "version": "==0.6.1" + }, + "pylint": { + "hashes": [ + "sha256:689de29ae747642ab230c6d37be2b969bf75663176658851f456619aacf27492", + "sha256:771467c434d0d9f081741fec1d64dfb011ed26e65e12a28fe06ca2f61c4d556c" + ], + "index": "pypi", + "version": "==2.2.2" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + }, + "wrapt": { + "hashes": [ + "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6" + ], + "version": "==1.10.11" + } + } +} diff --git a/cleantext/__init__.py b/cleantext/__init__.py new file mode 100644 index 0000000..cbfcda9 --- /dev/null +++ b/cleantext/__init__.py @@ -0,0 +1 @@ +from .clean import * diff --git a/cleantext/clean.py b/cleantext/clean.py new file mode 100644 index 0000000..ae7c070 --- /dev/null +++ b/cleantext/clean.py @@ -0,0 +1,304 @@ +""" + +""" + +import re +import unicodedata + +from ftfy import fix_text + +from . 
import constants + +# fall back to `unicodedata` +try: + from unidecode import unidecode +except: + pass + + +strange_double_quotes = [ + "«", + "‹", + "»", + "›", + "„", + "“", + "‟", + "”", + "❝", + "❞", + "❮", + "❯", + "〝", + "〞", + "〟", + """, +] +strange_single_quotes = ["‘", "‛", "’", "❛", "❜", "`", "´", '‘','’'] + + + + +def fix_strange_quotes(text): + text = str(text) + for q in strange_double_quotes: + text = text.replace(q, '"') + for q in strange_single_quotes: + text = text.replace(q, "'") + return text + + +def fix_bad_unicode(text, normalization="NFC"): + """ + Fix unicode text that's "broken" using `ftfy `_; + this includes mojibake, HTML entities and other code cruft, + and non-standard forms for display purposes. + Args: + text (str): raw text + normalization ({'NFC', 'NFKC', 'NFD', 'NFKD'}): if 'NFC', + combines characters and diacritics written using separate code points, + e.g. converting "e" plus an acute accent modifier into "é"; unicode + can be converted to NFC form without any change in its meaning! + if 'NFKC', additional normalizations are applied that can change + the meanings of characters, e.g. ellipsis characters will be replaced + with three periods + Returns: + str + """ + # fix if the unicode is fucked up + text = text.encode().decode("unicode-escape") + + # normalize quotes before + text = fix_strange_quotes(text) + + return fix_text(text, normalization=normalization) + + +def ascii_unicode(text): + """ + Try to represent unicode data in ascii characters similar to what a human + with a US keyboard would choose. + Works great for languages of Western origin, worse the farther the language + gets from Latin-based alphabets. It's based on hand-tuned character mappings + that also contain ascii approximations for symbols and non-Latin alphabets. 
+ """ + return unidecode(text) + + +def normalize_whitespace(text): + """ + Given ``text`` str, replace one or more spacings with a single space, and one + or more linebreaks with a single newline. Also strip leading/trailing whitespace. + """ + return constants.NONBREAKING_SPACE_REGEX.sub( + " ", constants.LINEBREAK_REGEX.sub(r"\n", text) + ).strip() + + +def unpack_contractions(text): + """ + Replace *English* contractions in ``text`` str with their unshortened forms. + N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive), + so are left as-is. + """ + # standard + text = re.sub( + r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n't", + r"\1\2 not", + text, + ) + text = re.sub( + r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll", + r"\1\2 will", + text, + ) + text = re.sub(r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re", r"\1\2 are", text) + text = re.sub( + r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)'ve", + r"\1\2 have", + text, + ) + # non-standard + text = re.sub(r"(\b)([Cc]a)n't", r"\1\2n not", text) + text = re.sub(r"(\b)([Ii])'m", r"\1\2 am", text) + text = re.sub(r"(\b)([Ll]et)'s", r"\1\2 us", text) + text = re.sub(r"(\b)([Ww])on't", r"\1\2ill not", text) + text = re.sub(r"(\b)([Ss])han't", r"\1\2hall not", text) + text = re.sub(r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all", text) + return text + + +def replace_urls(text, replace_with=""): + """Replace all URLs in ``text`` str with ``replace_with`` str.""" + return constants.URL_REGEX.sub( + replace_with, constants.SHORT_URL_REGEX.sub(replace_with, text) + ) + + +def replace_emails(text, replace_with=""): + """Replace all emails in ``text`` str with ``replace_with`` str.""" + return constants.EMAIL_REGEX.sub(replace_with, text) + + +def replace_phone_numbers(text, replace_with=""): + """Replace all phone numbers in ``text`` str with ``replace_with`` str.""" + return 
constants.PHONE_REGEX.sub(replace_with, text) + + +def replace_numbers(text, replace_with=""): + """Replace all numbers in ``text`` str with ``replace_with`` str.""" + return constants.NUMBERS_REGEX.sub(replace_with, text) + + +def replace_currency_symbols(text, replace_with=None): + """ + Replace all currency symbols in ``text`` str with string specified by ``replace_with`` str. + Args: + text (str): raw text + replace_with (str): if None (default), replace symbols with + their standard 3-letter abbreviations (e.g. '$' with 'USD', '£' with 'GBP'); + otherwise, pass in a string with which to replace all symbols + (e.g. "*CURRENCY*") + Returns: + str + """ + if replace_with is None: + for k, v in constants.CURRENCIES.items(): + text = text.replace(k, v) + return text + else: + return constants.CURRENCY_REGEX.sub(replace_with, text) + + +def remove_punct(text, marks=None): + """ + Remove punctuation from ``text`` by replacing all instances of ``marks`` + with whitespace. + Args: + text (str): raw text + marks (str): If specified, remove only the characters in this string, + e.g. ``marks=',;:'`` removes commas, semi-colons, and colons. + Otherwise, all punctuation marks are removed. + Returns: + str + Note: + When ``marks=None``, Python's built-in :meth:`str.translate()` is + used to remove punctuation; otherwise, a regular expression is used + instead. The former's performance is about 5-10x faster. + """ + if marks: + return re.sub("[{}]+".format(re.escape(marks)), " ", text, flags=re.UNICODE) + else: + return text.translate(constants.PUNCT_TRANSLATE_UNICODE) + + +def remove_accents(text, method="unicode"): + """ + Remove accents from any accented unicode characters in ``text`` str, either by + transforming them into ascii equivalents or removing them entirely. 
+ Args: + text (str): raw text + method ({'unicode', 'ascii'}): if 'unicode', remove accented + char for any unicode symbol with a direct ASCII equivalent; if 'ascii', + remove accented char for any unicode symbol + NB: the 'ascii' method is notably faster than 'unicode', but less good + Returns: + str + Raises: + ValueError: if ``method`` is not in {'unicode', 'ascii'} + """ + if method == "unicode": + return "".join( + c + for c in unicodedata.normalize("NFKD", text) + if not unicodedata.combining(c) + ) + elif method == "ascii": + return ( + unicodedata.normalize("NFKD", text) + .encode("ascii", errors="ignore") + .decode("ascii") + ) + else: + msg = '`method` must be either "unicode" and "ascii", not {}'.format(method) + raise ValueError(msg) + + +def zero_digits(text): + """ + All digits are reduced to 0. 123.34 to 000.00 + """ + return re.sub(r"\d", "0", text) + + +def clean( + text, + fix_unicode=False, + lower=False, + ascii=False, + no_urls=False, + no_emails=False, + no_phone_numbers=False, + no_numbers=False, + no_currency_symbols=False, + no_punct=False, + no_contractions=False, + no_accents=False, +): + """ + Normalize various aspects of a raw text doc before parsing it with Spacy. + A convenience function for applying all other preprocessing functions in one go. 
+ Args: + text (str): raw text to preprocess + fix_unicode (bool): if True, fix "broken" unicode such as + mojibake and garbled HTML entities + lower (bool): if True, all text is lower-cased + ascii (bool): if True, convert non-ascii characters + into their closest ascii equivalents + no_urls (bool): if True, replace all URL strings with '*URL*' + no_emails (bool): if True, replace all email strings with '*EMAIL*' + no_phone_numbers (bool): if True, replace all phone number strings + with '*PHONE*' + no_numbers (bool): if True, replace all number-like strings + with '*NUMBER*' + no_currency_symbols (bool): if True, replace all currency symbols + with their standard 3-letter abbreviations + no_punct (bool): if True, remove all punctuation (replace with + empty string) + no_contractions (bool): if True, replace *English* contractions + with their unshortened forms + no_accents (bool): if True, replace all accented characters + with unaccented versions; NB: if `ascii` is True, this option + is redundant + Returns: + str: input ``text`` processed according to function args + Warning: + These changes may negatively affect subsequent NLP analysis performed + on the text, so choose carefully, and preprocess at your own risk! 
+ """ + if fix_unicode is True: + text = fix_bad_unicode(text, normalization="NFC") + if ascii is True: + text = ascii_unicode(text) + if no_urls is True: + text = replace_urls(text) + if no_emails is True: + text = replace_emails(text) + if no_phone_numbers is True: + text = replace_phone_numbers(text) + if no_numbers is True: + text = replace_numbers(text) + if no_currency_symbols is True: + text = replace_currency_symbols(text) + if no_contractions is True: + text = unpack_contractions(text) + if no_accents is True: + text = remove_accents(text, method="unicode") + if no_punct is True: + text = remove_punct(text) + if lower is True: + text = text.lower() + # always normalize whitespace; treat linebreaks separately from spacing + text = normalize_whitespace(text) + + return text diff --git a/cleantext/compat.py b/cleantext/compat.py new file mode 100644 index 0000000..ad91d35 --- /dev/null +++ b/cleantext/compat.py @@ -0,0 +1,52 @@ +""" +""" + +from __future__ import print_function + +import sys + +is_python2 = int(sys.version[0]) == 2 +is_windows = sys.platform.startswith("win") +is_linux = sys.platform.startswith("linux") +is_osx = sys.platform == "darwin" + +if is_python2: + import cPickle as pickle + from backports import csv + from itertools import izip as zip_ + from urlparse import urljoin + + range_ = xrange + + bytes_ = str + unicode_ = unicode + string_types = (str, unicode) + int_types = (int, long) + chr_ = unichr + + def unicode_to_bytes(s, encoding="utf8", errors="strict"): + return s.encode(encoding=encoding, errors=errors) + + def bytes_to_unicode(b, encoding="utf8", errors="strict"): + return unicode_(b, encoding=encoding, errors=errors) + + +else: + import csv + import pickle + from builtins import zip as zip_ + from urllib.parse import urljoin + + range_ = range + + bytes_ = bytes + unicode_ = str + string_types = (bytes, str) + int_types = (int,) + chr_ = chr + + def unicode_to_bytes(s, encoding="utf8", errors="strict"): + return 
s.encode(encoding=encoding, errors=errors) + + def bytes_to_unicode(b, encoding="utf8", errors="strict"): + return b.decode(encoding=encoding, errors=errors) diff --git a/cleantext/constants.py b/cleantext/constants.py new file mode 100644 index 0000000..4763a66 --- /dev/null +++ b/cleantext/constants.py @@ -0,0 +1,111 @@ +""" + +""" + +import re +import sys +import unicodedata + +from . import compat + +CURRENCIES = { + "$": "USD", + "zł": "PLN", + "£": "GBP", + "¥": "JPY", + "฿": "THB", + "₡": "CRC", + "₦": "NGN", + "₩": "KRW", + "₪": "ILS", + "₫": "VND", + "€": "EUR", + "₱": "PHP", + "₲": "PYG", + "₴": "UAH", + "₹": "INR", +} + + +PUNCT_TRANSLATE_UNICODE = dict.fromkeys( + ( + i + for i in compat.range_(sys.maxunicode) + if unicodedata.category(compat.chr_(i)).startswith("P") + ), + u" ", +) + +ACRONYM_REGEX = re.compile( + r"(?:^|(?<=\W))(?:(?:(?:(?:[A-Z]\.?)+[a-z0-9&/-]?)+(?:[A-Z][s.]?|[0-9]s?))|(?:[0-9](?:\-?[A-Z])+))(?:$|(?=\W))", + flags=re.UNICODE, +) +EMAIL_REGEX = re.compile( + r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(\.([a-z]{2,})){1,3}(?:$|(?=\b))", + flags=re.IGNORECASE | re.UNICODE, +) +PHONE_REGEX = re.compile( + r"(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W))" +) +NUMBERS_REGEX = re.compile( + r"(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|(\d*?[.,]\d+)|\d+)(?:$|(?=\b))" +) +CURRENCY_REGEX = re.compile( + "({})+".format("|".join(re.escape(c) for c in CURRENCIES.keys())) +) +LINEBREAK_REGEX = re.compile(r"((\r\n)|[\n\v])+") +NONBREAKING_SPACE_REGEX = re.compile(r"(?!\n)\s+") +URL_REGEX = re.compile( + r"(?:^|(?= 224.0.0.0 + # excludes network & broadcast addresses + # (first & last IP address of each class) + r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])" + r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" + r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" + r"|" + # host name + r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)" 
+ # domain name + r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*" + # TLD identifier + r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" r")" + # port number + r"(?::\d{2,5})?" + # resource path + r"(?:/\S*)?" r"(?:$|(?![\w?!+&/]))", + flags=re.UNICODE | re.IGNORECASE, +) # source: https://gist.github.com/dperini/729294 + +SHORT_URL_REGEX = re.compile( + r"(?:^|(?