diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..5683ee0
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,17 @@
+# http://editorconfig.org
+
+root = true
+
+[*]
+indent_style = space
+indent_size = 4
+insert_final_newline = true
+trim_trailing_whitespace = true
+end_of_line = lf
+charset = utf-8
+
+[*.py]
+max_line_length = 119
+
+[*.md]
+insert_final_newline = false
diff --git a/.gitignore b/.gitignore
index 894a44c..7d3392b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,4 @@ venv.bak/
# mypy
.mypy_cache/
+.vscode
diff --git a/LICENSE b/LICENSE
index b9148f5..866c37d 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,21 +1,18 @@
-MIT License
+Copyright 2016 Chartbeat, Inc.
-Copyright (c) 2018 Johannes Filter
+Modified by Johannes Filter, 2018
+- only use `constants.py`, `preprocess.py` (renamed to clean.py) and `compat.py`
+- only use `test_preprocess` and renamed to `test_clean.py`
+- modification to the code to add new features
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
+ http://www.apache.org/licenses/LICENSE-2.0
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..2c92e7a
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,17 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+pylint = "*"
+black = "*"
+
+[packages]
+cleantext = {editable = true,extras = ["gpl"],path = "."}
+ftfy = "*"
+unidecode = "*"
+pytest = "*"
+
+[requires]
+python_version = "3.7"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..ebe33ca
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,179 @@
+{
+ "_meta": {
+ "hash": {
+ "sha256": "933b2d6b2d62530ba230ca07596269006bc8c9d645c358603b0ebc8cda215f06"
+ },
+ "pipfile-spec": 6,
+ "requires": {
+ "python_version": "3.7"
+ },
+ "sources": [
+ {
+ "name": "pypi",
+ "url": "https://pypi.org/simple",
+ "verify_ssl": true
+ }
+ ]
+ },
+ "default": {
+ "atomicwrites": {
+ "hashes": [
+ "sha256:0312ad34fcad8fac3704d441f7b317e50af620823353ec657a53e981f92920c0",
+ "sha256:ec9ae8adaae229e4f8446952d204a3e4b5fdd2d099f9be3aaf556120135fb3ee"
+ ],
+ "version": "==1.2.1"
+ },
+ "attrs": {
+ "hashes": [
+ "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69",
+ "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb"
+ ],
+ "version": "==18.2.0"
+ },
+ "cleantext": {
+ "editable": true,
+ "extras": [
+ "gpl"
+ ],
+ "path": "."
+ },
+ "ftfy": {
+ "hashes": [
+ "sha256:525ea45a871f52ddb170e66b01f35f1b3022995016c81efa305e628937b85443"
+ ],
+ "index": "pypi",
+ "version": "==5.5.0"
+ },
+ "more-itertools": {
+ "hashes": [
+ "sha256:c187a73da93e7a8acc0001572aebc7e3c69daf7bf6881a2cea10650bd4420092",
+ "sha256:c476b5d3a34e12d40130bc2f935028b5f636df8f372dc2c1c01dc19681b2039e",
+ "sha256:fcbfeaea0be121980e15bc97b3817b5202ca73d0eae185b4550cbfce2a3ebb3d"
+ ],
+ "version": "==4.3.0"
+ },
+ "pluggy": {
+ "hashes": [
+ "sha256:447ba94990e8014ee25ec853339faf7b0fc8050cdc3289d4d71f7f410fb90095",
+ "sha256:bde19360a8ec4dfd8a20dcb811780a30998101f078fc7ded6162f0076f50508f"
+ ],
+ "version": "==0.8.0"
+ },
+ "py": {
+ "hashes": [
+ "sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694",
+ "sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6"
+ ],
+ "version": "==1.7.0"
+ },
+ "pytest": {
+ "hashes": [
+ "sha256:f689bf2fc18c4585403348dd56f47d87780bf217c53ed9ae7a3e2d7faa45f8e9",
+ "sha256:f812ea39a0153566be53d88f8de94839db1e8a05352ed8a49525d7d7f37861e9"
+ ],
+ "index": "pypi",
+ "version": "==4.0.2"
+ },
+ "six": {
+ "hashes": [
+ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+ "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+ ],
+ "version": "==1.12.0"
+ },
+ "unidecode": {
+ "hashes": [
+ "sha256:092cdf7ad9d1052c50313426a625b717dab52f7ac58f859e09ea020953b1ad8f",
+ "sha256:8b85354be8fd0c0e10adbf0675f6dc2310e56fda43fa8fe049123b6c475e52fb"
+ ],
+ "index": "pypi",
+ "version": "==1.0.23"
+ },
+ "wcwidth": {
+ "hashes": [
+ "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
+ "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+ ],
+ "version": "==0.1.7"
+ }
+ },
+ "develop": {
+ "astroid": {
+ "hashes": [
+ "sha256:35b032003d6a863f5dcd7ec11abd5cd5893428beaa31ab164982403bcb311f22",
+ "sha256:6a5d668d7dc69110de01cdf7aeec69a679ef486862a0850cc0fd5571505b6b7e"
+ ],
+ "version": "==2.1.0"
+ },
+ "isort": {
+ "hashes": [
+ "sha256:1153601da39a25b14ddc54955dbbacbb6b2d19135386699e2ad58517953b34af",
+ "sha256:b9c40e9750f3d77e6e4d441d8b0266cf555e7cdabdcff33c4fd06366ca761ef8",
+ "sha256:ec9ef8f4a9bc6f71eec99e1806bfa2de401650d996c59330782b89a5555c1497"
+ ],
+ "version": "==4.3.4"
+ },
+ "lazy-object-proxy": {
+ "hashes": [
+ "sha256:0ce34342b419bd8f018e6666bfef729aec3edf62345a53b537a4dcc115746a33",
+ "sha256:1b668120716eb7ee21d8a38815e5eb3bb8211117d9a90b0f8e21722c0758cc39",
+ "sha256:209615b0fe4624d79e50220ce3310ca1a9445fd8e6d3572a896e7f9146bbf019",
+ "sha256:27bf62cb2b1a2068d443ff7097ee33393f8483b570b475db8ebf7e1cba64f088",
+ "sha256:27ea6fd1c02dcc78172a82fc37fcc0992a94e4cecf53cb6d73f11749825bd98b",
+ "sha256:2c1b21b44ac9beb0fc848d3993924147ba45c4ebc24be19825e57aabbe74a99e",
+ "sha256:2df72ab12046a3496a92476020a1a0abf78b2a7db9ff4dc2036b8dd980203ae6",
+ "sha256:320ffd3de9699d3892048baee45ebfbbf9388a7d65d832d7e580243ade426d2b",
+ "sha256:50e3b9a464d5d08cc5227413db0d1c4707b6172e4d4d915c1c70e4de0bbff1f5",
+ "sha256:5276db7ff62bb7b52f77f1f51ed58850e315154249aceb42e7f4c611f0f847ff",
+ "sha256:61a6cf00dcb1a7f0c773ed4acc509cb636af2d6337a08f362413c76b2b47a8dd",
+ "sha256:6ae6c4cb59f199d8827c5a07546b2ab7e85d262acaccaacd49b62f53f7c456f7",
+ "sha256:7661d401d60d8bf15bb5da39e4dd72f5d764c5aff5a86ef52a042506e3e970ff",
+ "sha256:7bd527f36a605c914efca5d3d014170b2cb184723e423d26b1fb2fd9108e264d",
+ "sha256:7cb54db3535c8686ea12e9535eb087d32421184eacc6939ef15ef50f83a5e7e2",
+ "sha256:7f3a2d740291f7f2c111d86a1c4851b70fb000a6c8883a59660d95ad57b9df35",
+ "sha256:81304b7d8e9c824d058087dcb89144842c8e0dea6d281c031f59f0acf66963d4",
+ "sha256:933947e8b4fbe617a51528b09851685138b49d511af0b6c0da2539115d6d4514",
+ "sha256:94223d7f060301b3a8c09c9b3bc3294b56b2188e7d8179c762a1cda72c979252",
+ "sha256:ab3ca49afcb47058393b0122428358d2fbe0408cf99f1b58b295cfeb4ed39109",
+ "sha256:bd6292f565ca46dee4e737ebcc20742e3b5be2b01556dafe169f6c65d088875f",
+ "sha256:cb924aa3e4a3fb644d0c463cad5bc2572649a6a3f68a7f8e4fbe44aaa6d77e4c",
+ "sha256:d0fc7a286feac9077ec52a927fc9fe8fe2fabab95426722be4c953c9a8bede92",
+ "sha256:ddc34786490a6e4ec0a855d401034cbd1242ef186c20d79d2166d6a4bd449577",
+ "sha256:e34b155e36fa9da7e1b7c738ed7767fc9491a62ec6af70fe9da4a057759edc2d",
+ "sha256:e5b9e8f6bda48460b7b143c3821b21b452cb3a835e6bbd5dd33aa0c8d3f5137d",
+ "sha256:e81ebf6c5ee9684be8f2c87563880f93eedd56dd2b6146d8a725b50b7e5adb0f",
+ "sha256:eb91be369f945f10d3a49f5f9be8b3d0b93a4c2be8f8a5b83b0571b8123e0a7a",
+ "sha256:f460d1ceb0e4a5dcb2a652db0904224f367c9b3c1470d5a7683c0480e582468b"
+ ],
+ "version": "==1.3.1"
+ },
+ "mccabe": {
+ "hashes": [
+ "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
+ "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
+ ],
+ "version": "==0.6.1"
+ },
+ "pylint": {
+ "hashes": [
+ "sha256:689de29ae747642ab230c6d37be2b969bf75663176658851f456619aacf27492",
+ "sha256:771467c434d0d9f081741fec1d64dfb011ed26e65e12a28fe06ca2f61c4d556c"
+ ],
+ "index": "pypi",
+ "version": "==2.2.2"
+ },
+ "six": {
+ "hashes": [
+ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+ "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+ ],
+ "version": "==1.12.0"
+ },
+ "wrapt": {
+ "hashes": [
+ "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6"
+ ],
+ "version": "==1.10.11"
+ }
+ }
+}
diff --git a/cleantext/__init__.py b/cleantext/__init__.py
new file mode 100644
index 0000000..cbfcda9
--- /dev/null
+++ b/cleantext/__init__.py
@@ -0,0 +1 @@
+from .clean import *
diff --git a/cleantext/clean.py b/cleantext/clean.py
new file mode 100644
index 0000000..ae7c070
--- /dev/null
+++ b/cleantext/clean.py
@@ -0,0 +1,304 @@
+"""
+
+"""
+
+import re
+import unicodedata
+
+from ftfy import fix_text
+
+from . import constants
+
+# fall back to `unicodedata`
+try:
+ from unidecode import unidecode
+except:
+ pass
+
+
+strange_double_quotes = [
+ "«",
+ "‹",
+ "»",
+ "›",
+ "„",
+ "“",
+ "‟",
+ "”",
+ "❝",
+ "❞",
+ "❮",
+ "❯",
+ "〝",
+ "〞",
+ "〟",
+ "＂",
+]
+strange_single_quotes = ["‘", "‛", "’", "❛", "❜", "`", "´", '‘','’']
+
+
+
+
+def fix_strange_quotes(text):
+ text = str(text)
+ for q in strange_double_quotes:
+ text = text.replace(q, '"')
+ for q in strange_single_quotes:
+ text = text.replace(q, "'")
+ return text
+
+
+def fix_bad_unicode(text, normalization="NFC"):
+ """
+ Fix unicode text that's "broken" using `ftfy `_;
+ this includes mojibake, HTML entities and other code cruft,
+ and non-standard forms for display purposes.
+ Args:
+ text (str): raw text
+ normalization ({'NFC', 'NFKC', 'NFD', 'NFKD'}): if 'NFC',
+ combines characters and diacritics written using separate code points,
+ e.g. converting "e" plus an acute accent modifier into "é"; unicode
+ can be converted to NFC form without any change in its meaning!
+ if 'NFKC', additional normalizations are applied that can change
+ the meanings of characters, e.g. ellipsis characters will be replaced
+ with three periods
+ Returns:
+ str
+ """
+ # fix if the unicode is fucked up
+ text = text.encode().decode("unicode-escape")
+
+ # normalize quotes before
+ text = fix_strange_quotes(text)
+
+ return fix_text(text, normalization=normalization)
+
+
+def ascii_unicode(text):
+ """
+ Try to represent unicode data in ascii characters similar to what a human
+ with a US keyboard would choose.
+ Works great for languages of Western origin, worse the farther the language
+ gets from Latin-based alphabets. It's based on hand-tuned character mappings
+ that also contain ascii approximations for symbols and non-Latin alphabets.
+ """
+ return unidecode(text)
+
+
+def normalize_whitespace(text):
+ """
+ Given ``text`` str, replace one or more spacings with a single space, and one
+ or more linebreaks with a single newline. Also strip leading/trailing whitespace.
+ """
+ return constants.NONBREAKING_SPACE_REGEX.sub(
+ " ", constants.LINEBREAK_REGEX.sub(r"\n", text)
+ ).strip()
+
+
+def unpack_contractions(text):
+ """
+ Replace *English* contractions in ``text`` str with their unshortened forms.
+ N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive),
+ so are left as-is.
+ """
+ # standard
+ text = re.sub(
+ r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n't",
+ r"\1\2 not",
+ text,
+ )
+ text = re.sub(
+ r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll",
+ r"\1\2 will",
+ text,
+ )
+ text = re.sub(r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re", r"\1\2 are", text)
+ text = re.sub(
+ r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)'ve",
+ r"\1\2 have",
+ text,
+ )
+ # non-standard
+ text = re.sub(r"(\b)([Cc]a)n't", r"\1\2n not", text)
+ text = re.sub(r"(\b)([Ii])'m", r"\1\2 am", text)
+ text = re.sub(r"(\b)([Ll]et)'s", r"\1\2 us", text)
+ text = re.sub(r"(\b)([Ww])on't", r"\1\2ill not", text)
+ text = re.sub(r"(\b)([Ss])han't", r"\1\2hall not", text)
+ text = re.sub(r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all", text)
+ return text
+
+
+def replace_urls(text, replace_with=""):
+ """Replace all URLs in ``text`` str with ``replace_with`` str."""
+ return constants.URL_REGEX.sub(
+ replace_with, constants.SHORT_URL_REGEX.sub(replace_with, text)
+ )
+
+
+def replace_emails(text, replace_with=""):
+ """Replace all emails in ``text`` str with ``replace_with`` str."""
+ return constants.EMAIL_REGEX.sub(replace_with, text)
+
+
+def replace_phone_numbers(text, replace_with=""):
+ """Replace all phone numbers in ``text`` str with ``replace_with`` str."""
+ return constants.PHONE_REGEX.sub(replace_with, text)
+
+
+def replace_numbers(text, replace_with=""):
+ """Replace all numbers in ``text`` str with ``replace_with`` str."""
+ return constants.NUMBERS_REGEX.sub(replace_with, text)
+
+
+def replace_currency_symbols(text, replace_with=None):
+ """
+ Replace all currency symbols in ``text`` str with string specified by ``replace_with`` str.
+ Args:
+ text (str): raw text
+ replace_with (str): if None (default), replace symbols with
+ their standard 3-letter abbreviations (e.g. '$' with 'USD', '£' with 'GBP');
+ otherwise, pass in a string with which to replace all symbols
+ (e.g. "*CURRENCY*")
+ Returns:
+ str
+ """
+ if replace_with is None:
+ for k, v in constants.CURRENCIES.items():
+ text = text.replace(k, v)
+ return text
+ else:
+ return constants.CURRENCY_REGEX.sub(replace_with, text)
+
+
+def remove_punct(text, marks=None):
+ """
+ Remove punctuation from ``text`` by replacing all instances of ``marks``
+ with whitespace.
+ Args:
+ text (str): raw text
+ marks (str): If specified, remove only the characters in this string,
+ e.g. ``marks=',;:'`` removes commas, semi-colons, and colons.
+ Otherwise, all punctuation marks are removed.
+ Returns:
+ str
+ Note:
+ When ``marks=None``, Python's built-in :meth:`str.translate()` is
+ used to remove punctuation; otherwise, a regular expression is used
+ instead. The former's performance is about 5-10x faster.
+ """
+ if marks:
+ return re.sub("[{}]+".format(re.escape(marks)), " ", text, flags=re.UNICODE)
+ else:
+ return text.translate(constants.PUNCT_TRANSLATE_UNICODE)
+
+
+def remove_accents(text, method="unicode"):
+ """
+ Remove accents from any accented unicode characters in ``text`` str, either by
+ transforming them into ascii equivalents or removing them entirely.
+ Args:
+ text (str): raw text
+ method ({'unicode', 'ascii'}): if 'unicode', remove accented
+ char for any unicode symbol with a direct ASCII equivalent; if 'ascii',
+ remove accented char for any unicode symbol
+ NB: the 'ascii' method is notably faster than 'unicode', but less good
+ Returns:
+ str
+ Raises:
+ ValueError: if ``method`` is not in {'unicode', 'ascii'}
+ """
+ if method == "unicode":
+ return "".join(
+ c
+ for c in unicodedata.normalize("NFKD", text)
+ if not unicodedata.combining(c)
+ )
+ elif method == "ascii":
+ return (
+ unicodedata.normalize("NFKD", text)
+ .encode("ascii", errors="ignore")
+ .decode("ascii")
+ )
+ else:
+ msg = '`method` must be either "unicode" and "ascii", not {}'.format(method)
+ raise ValueError(msg)
+
+
+def zero_digits(text):
+ """
+ All digits are reduced to 0. 123.34 to 000.00
+ """
+ return re.sub(r"\d", "0", text)
+
+
+def clean(
+ text,
+ fix_unicode=False,
+ lower=False,
+ ascii=False,
+ no_urls=False,
+ no_emails=False,
+ no_phone_numbers=False,
+ no_numbers=False,
+ no_currency_symbols=False,
+ no_punct=False,
+ no_contractions=False,
+ no_accents=False,
+):
+ """
+ Normalize various aspects of a raw text doc before parsing it with Spacy.
+ A convenience function for applying all other preprocessing functions in one go.
+ Args:
+ text (str): raw text to preprocess
+ fix_unicode (bool): if True, fix "broken" unicode such as
+ mojibake and garbled HTML entities
+ lower (bool): if True, all text is lower-cased
+ ascii (bool): if True, convert non-ascii characters
+ into their closest ascii equivalents
+ no_urls (bool): if True, replace all URL strings with '*URL*'
+ no_emails (bool): if True, replace all email strings with '*EMAIL*'
+ no_phone_numbers (bool): if True, replace all phone number strings
+ with '*PHONE*'
+ no_numbers (bool): if True, replace all number-like strings
+ with '*NUMBER*'
+ no_currency_symbols (bool): if True, replace all currency symbols
+ with their standard 3-letter abbreviations
+ no_punct (bool): if True, remove all punctuation (replace with
+ empty string)
+ no_contractions (bool): if True, replace *English* contractions
+ with their unshortened forms
+ no_accents (bool): if True, replace all accented characters
+ with unaccented versions; NB: if `ascii` is True, this option
+ is redundant
+ Returns:
+ str: input ``text`` processed according to function args
+ Warning:
+ These changes may negatively affect subsequent NLP analysis performed
+ on the text, so choose carefully, and preprocess at your own risk!
+ """
+ if fix_unicode is True:
+ text = fix_bad_unicode(text, normalization="NFC")
+ if ascii is True:
+ text = ascii_unicode(text)
+ if no_urls is True:
+ text = replace_urls(text)
+ if no_emails is True:
+ text = replace_emails(text)
+ if no_phone_numbers is True:
+ text = replace_phone_numbers(text)
+ if no_numbers is True:
+ text = replace_numbers(text)
+ if no_currency_symbols is True:
+ text = replace_currency_symbols(text)
+ if no_contractions is True:
+ text = unpack_contractions(text)
+ if no_accents is True:
+ text = remove_accents(text, method="unicode")
+ if no_punct is True:
+ text = remove_punct(text)
+ if lower is True:
+ text = text.lower()
+ # always normalize whitespace; treat linebreaks separately from spacing
+ text = normalize_whitespace(text)
+
+ return text
diff --git a/cleantext/compat.py b/cleantext/compat.py
new file mode 100644
index 0000000..ad91d35
--- /dev/null
+++ b/cleantext/compat.py
@@ -0,0 +1,52 @@
+"""
+"""
+
+from __future__ import print_function
+
+import sys
+
+is_python2 = int(sys.version[0]) == 2
+is_windows = sys.platform.startswith("win")
+is_linux = sys.platform.startswith("linux")
+is_osx = sys.platform == "darwin"
+
+if is_python2:
+ import cPickle as pickle
+ from backports import csv
+ from itertools import izip as zip_
+ from urlparse import urljoin
+
+ range_ = xrange
+
+ bytes_ = str
+ unicode_ = unicode
+ string_types = (str, unicode)
+ int_types = (int, long)
+ chr_ = unichr
+
+ def unicode_to_bytes(s, encoding="utf8", errors="strict"):
+ return s.encode(encoding=encoding, errors=errors)
+
+ def bytes_to_unicode(b, encoding="utf8", errors="strict"):
+ return unicode_(b, encoding=encoding, errors=errors)
+
+
+else:
+ import csv
+ import pickle
+ from builtins import zip as zip_
+ from urllib.parse import urljoin
+
+ range_ = range
+
+ bytes_ = bytes
+ unicode_ = str
+ string_types = (bytes, str)
+ int_types = (int,)
+ chr_ = chr
+
+ def unicode_to_bytes(s, encoding="utf8", errors="strict"):
+ return s.encode(encoding=encoding, errors=errors)
+
+ def bytes_to_unicode(b, encoding="utf8", errors="strict"):
+ return b.decode(encoding=encoding, errors=errors)
diff --git a/cleantext/constants.py b/cleantext/constants.py
new file mode 100644
index 0000000..4763a66
--- /dev/null
+++ b/cleantext/constants.py
@@ -0,0 +1,111 @@
+"""
+
+"""
+
+import re
+import sys
+import unicodedata
+
+from . import compat
+
+CURRENCIES = {
+ "$": "USD",
+ "zł": "PLN",
+ "£": "GBP",
+ "¥": "JPY",
+ "฿": "THB",
+ "₡": "CRC",
+ "₦": "NGN",
+ "₩": "KRW",
+ "₪": "ILS",
+ "₫": "VND",
+ "€": "EUR",
+ "₱": "PHP",
+ "₲": "PYG",
+ "₴": "UAH",
+ "₹": "INR",
+}
+
+
+PUNCT_TRANSLATE_UNICODE = dict.fromkeys(
+ (
+ i
+ for i in compat.range_(sys.maxunicode)
+ if unicodedata.category(compat.chr_(i)).startswith("P")
+ ),
+ u" ",
+)
+
+ACRONYM_REGEX = re.compile(
+ r"(?:^|(?<=\W))(?:(?:(?:(?:[A-Z]\.?)+[a-z0-9&/-]?)+(?:[A-Z][s.]?|[0-9]s?))|(?:[0-9](?:\-?[A-Z])+))(?:$|(?=\W))",
+ flags=re.UNICODE,
+)
+EMAIL_REGEX = re.compile(
+ r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(\.([a-z]{2,})){1,3}(?:$|(?=\b))",
+ flags=re.IGNORECASE | re.UNICODE,
+)
+PHONE_REGEX = re.compile(
+ r"(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W))"
+)
+NUMBERS_REGEX = re.compile(
+ r"(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|(\d*?[.,]\d+)|\d+)(?:$|(?=\b))"
+)
+CURRENCY_REGEX = re.compile(
+ "({})+".format("|".join(re.escape(c) for c in CURRENCIES.keys()))
+)
+LINEBREAK_REGEX = re.compile(r"((\r\n)|[\n\v])+")
+NONBREAKING_SPACE_REGEX = re.compile(r"(?!\n)\s+")
+URL_REGEX = re.compile(
+ r"(?:^|(?<![\w/.]))"
+ # protocol identifier
+ r"(?:(?:https?://|ftp://|www\d{0,3}\.))"
+ # user:pass authentication
+ r"(?:\S+(?::\S*)?@)?" r"(?:"
+ # IP address exclusion
+ # private & local networks
+ r"(?!(?:10|127)(?:\.\d{1,3}){3})"
+ r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
+ r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
+ # IP address dotted notation octets
+ # excludes loopback network 0.0.0.0
+ # excludes reserved space >= 224.0.0.0
+ # excludes network & broadcast addresses
+ # (first & last IP address of each class)
+ r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
+ r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
+ r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
+ r"|"
+ # host name
+ r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
+ # domain name
+ r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
+ # TLD identifier
+ r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" r")"
+ # port number
+ r"(?::\d{2,5})?"
+ # resource path
+ r"(?:/\S*)?" r"(?:$|(?![\w?!+&/]))",
+ flags=re.UNICODE | re.IGNORECASE,
+) # source: https://gist.github.com/dperini/729294
+
+SHORT_URL_REGEX = re.compile(
+ r"(?:^|(?<![\w/.]))"