From d0c1eb6077d3b42d3c48232765c276ffe9f04daa Mon Sep 17 00:00:00 2001 From: Johannes Filter Date: Fri, 21 Dec 2018 21:18:08 +0100 Subject: [PATCH] getting stuff done --- .editorconfig | 17 +++ .gitignore | 1 + LICENSE | 31 ++--- Pipfile | 17 +++ Pipfile.lock | 179 ++++++++++++++++++++++++ cleantext/__init__.py | 1 + cleantext/clean.py | 304 +++++++++++++++++++++++++++++++++++++++++ cleantext/compat.py | 52 +++++++ cleantext/constants.py | 111 +++++++++++++++ setup.py | 28 ++++ tests/test_clean.py | 88 ++++++++++++ 11 files changed, 812 insertions(+), 17 deletions(-) create mode 100644 .editorconfig create mode 100644 Pipfile create mode 100644 Pipfile.lock create mode 100644 cleantext/__init__.py create mode 100644 cleantext/clean.py create mode 100644 cleantext/compat.py create mode 100644 cleantext/constants.py create mode 100644 setup.py create mode 100644 tests/test_clean.py diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..5683ee0 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,17 @@ +# http://editorconfig.org + +root = true + +[*] +indent_style = space +indent_size = 4 +insert_final_newline = true +trim_trailing_whitespace = true +end_of_line = lf +charset = utf-8 + +[*.py] +max_line_length = 119 + +[*.md] +insert_final_newline = false diff --git a/.gitignore b/.gitignore index 894a44c..7d3392b 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,4 @@ venv.bak/ # mypy .mypy_cache/ +.vscode diff --git a/LICENSE b/LICENSE index b9148f5..866c37d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,18 @@ -MIT License +Copyright 2016 Chartbeat, Inc. 
-Copyright (c) 2018 Johannes Filter +Modified by Johannes Filter, 2018 +- only use `constants.py`, `preprocess.py` (renamed to clean.py) and `compat.py` +- only use `test_preprocess` and renamed to `test_clean.py` +- modification to the code to add new features -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. + http://www.apache.org/licenses/LICENSE-2.0 -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..2c92e7a --- /dev/null +++ b/Pipfile @@ -0,0 +1,17 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] +pylint = "*" +black = "*" + +[packages] +cleantext = {editable = true,extras = ["gpl"],path = "."} +ftfy = "*" +unidecode = "*" +pytest = "*" + +[requires] +python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..ebe33ca --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,179 @@ +{ + "_meta": { + "hash": { + "sha256": "933b2d6b2d62530ba230ca07596269006bc8c9d645c358603b0ebc8cda215f06" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.7" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "atomicwrites": { + "hashes": [ + "sha256:0312ad34fcad8fac3704d441f7b317e50af620823353ec657a53e981f92920c0", + "sha256:ec9ae8adaae229e4f8446952d204a3e4b5fdd2d099f9be3aaf556120135fb3ee" + ], + "version": "==1.2.1" + }, + "attrs": { + "hashes": [ + "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69", + "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb" + ], + "version": "==18.2.0" + }, + "cleantext": { + "editable": true, + "extras": [ + "gpl" + ], + "path": "." 
+ }, + "ftfy": { + "hashes": [ + "sha256:525ea45a871f52ddb170e66b01f35f1b3022995016c81efa305e628937b85443" + ], + "index": "pypi", + "version": "==5.5.0" + }, + "more-itertools": { + "hashes": [ + "sha256:c187a73da93e7a8acc0001572aebc7e3c69daf7bf6881a2cea10650bd4420092", + "sha256:c476b5d3a34e12d40130bc2f935028b5f636df8f372dc2c1c01dc19681b2039e", + "sha256:fcbfeaea0be121980e15bc97b3817b5202ca73d0eae185b4550cbfce2a3ebb3d" + ], + "version": "==4.3.0" + }, + "pluggy": { + "hashes": [ + "sha256:447ba94990e8014ee25ec853339faf7b0fc8050cdc3289d4d71f7f410fb90095", + "sha256:bde19360a8ec4dfd8a20dcb811780a30998101f078fc7ded6162f0076f50508f" + ], + "version": "==0.8.0" + }, + "py": { + "hashes": [ + "sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694", + "sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6" + ], + "version": "==1.7.0" + }, + "pytest": { + "hashes": [ + "sha256:f689bf2fc18c4585403348dd56f47d87780bf217c53ed9ae7a3e2d7faa45f8e9", + "sha256:f812ea39a0153566be53d88f8de94839db1e8a05352ed8a49525d7d7f37861e9" + ], + "index": "pypi", + "version": "==4.0.2" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + }, + "unidecode": { + "hashes": [ + "sha256:092cdf7ad9d1052c50313426a625b717dab52f7ac58f859e09ea020953b1ad8f", + "sha256:8b85354be8fd0c0e10adbf0675f6dc2310e56fda43fa8fe049123b6c475e52fb" + ], + "index": "pypi", + "version": "==1.0.23" + }, + "wcwidth": { + "hashes": [ + "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", + "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + ], + "version": "==0.1.7" + } + }, + "develop": { + "astroid": { + "hashes": [ + "sha256:35b032003d6a863f5dcd7ec11abd5cd5893428beaa31ab164982403bcb311f22", + "sha256:6a5d668d7dc69110de01cdf7aeec69a679ef486862a0850cc0fd5571505b6b7e" + ], + "version": 
"==2.1.0" + }, + "isort": { + "hashes": [ + "sha256:1153601da39a25b14ddc54955dbbacbb6b2d19135386699e2ad58517953b34af", + "sha256:b9c40e9750f3d77e6e4d441d8b0266cf555e7cdabdcff33c4fd06366ca761ef8", + "sha256:ec9ef8f4a9bc6f71eec99e1806bfa2de401650d996c59330782b89a5555c1497" + ], + "version": "==4.3.4" + }, + "lazy-object-proxy": { + "hashes": [ + "sha256:0ce34342b419bd8f018e6666bfef729aec3edf62345a53b537a4dcc115746a33", + "sha256:1b668120716eb7ee21d8a38815e5eb3bb8211117d9a90b0f8e21722c0758cc39", + "sha256:209615b0fe4624d79e50220ce3310ca1a9445fd8e6d3572a896e7f9146bbf019", + "sha256:27bf62cb2b1a2068d443ff7097ee33393f8483b570b475db8ebf7e1cba64f088", + "sha256:27ea6fd1c02dcc78172a82fc37fcc0992a94e4cecf53cb6d73f11749825bd98b", + "sha256:2c1b21b44ac9beb0fc848d3993924147ba45c4ebc24be19825e57aabbe74a99e", + "sha256:2df72ab12046a3496a92476020a1a0abf78b2a7db9ff4dc2036b8dd980203ae6", + "sha256:320ffd3de9699d3892048baee45ebfbbf9388a7d65d832d7e580243ade426d2b", + "sha256:50e3b9a464d5d08cc5227413db0d1c4707b6172e4d4d915c1c70e4de0bbff1f5", + "sha256:5276db7ff62bb7b52f77f1f51ed58850e315154249aceb42e7f4c611f0f847ff", + "sha256:61a6cf00dcb1a7f0c773ed4acc509cb636af2d6337a08f362413c76b2b47a8dd", + "sha256:6ae6c4cb59f199d8827c5a07546b2ab7e85d262acaccaacd49b62f53f7c456f7", + "sha256:7661d401d60d8bf15bb5da39e4dd72f5d764c5aff5a86ef52a042506e3e970ff", + "sha256:7bd527f36a605c914efca5d3d014170b2cb184723e423d26b1fb2fd9108e264d", + "sha256:7cb54db3535c8686ea12e9535eb087d32421184eacc6939ef15ef50f83a5e7e2", + "sha256:7f3a2d740291f7f2c111d86a1c4851b70fb000a6c8883a59660d95ad57b9df35", + "sha256:81304b7d8e9c824d058087dcb89144842c8e0dea6d281c031f59f0acf66963d4", + "sha256:933947e8b4fbe617a51528b09851685138b49d511af0b6c0da2539115d6d4514", + "sha256:94223d7f060301b3a8c09c9b3bc3294b56b2188e7d8179c762a1cda72c979252", + "sha256:ab3ca49afcb47058393b0122428358d2fbe0408cf99f1b58b295cfeb4ed39109", + "sha256:bd6292f565ca46dee4e737ebcc20742e3b5be2b01556dafe169f6c65d088875f", + 
"sha256:cb924aa3e4a3fb644d0c463cad5bc2572649a6a3f68a7f8e4fbe44aaa6d77e4c", + "sha256:d0fc7a286feac9077ec52a927fc9fe8fe2fabab95426722be4c953c9a8bede92", + "sha256:ddc34786490a6e4ec0a855d401034cbd1242ef186c20d79d2166d6a4bd449577", + "sha256:e34b155e36fa9da7e1b7c738ed7767fc9491a62ec6af70fe9da4a057759edc2d", + "sha256:e5b9e8f6bda48460b7b143c3821b21b452cb3a835e6bbd5dd33aa0c8d3f5137d", + "sha256:e81ebf6c5ee9684be8f2c87563880f93eedd56dd2b6146d8a725b50b7e5adb0f", + "sha256:eb91be369f945f10d3a49f5f9be8b3d0b93a4c2be8f8a5b83b0571b8123e0a7a", + "sha256:f460d1ceb0e4a5dcb2a652db0904224f367c9b3c1470d5a7683c0480e582468b" + ], + "version": "==1.3.1" + }, + "mccabe": { + "hashes": [ + "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", + "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f" + ], + "version": "==0.6.1" + }, + "pylint": { + "hashes": [ + "sha256:689de29ae747642ab230c6d37be2b969bf75663176658851f456619aacf27492", + "sha256:771467c434d0d9f081741fec1d64dfb011ed26e65e12a28fe06ca2f61c4d556c" + ], + "index": "pypi", + "version": "==2.2.2" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + }, + "wrapt": { + "hashes": [ + "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6" + ], + "version": "==1.10.11" + } + } +} diff --git a/cleantext/__init__.py b/cleantext/__init__.py new file mode 100644 index 0000000..cbfcda9 --- /dev/null +++ b/cleantext/__init__.py @@ -0,0 +1 @@ +from .clean import * diff --git a/cleantext/clean.py b/cleantext/clean.py new file mode 100644 index 0000000..ae7c070 --- /dev/null +++ b/cleantext/clean.py @@ -0,0 +1,304 @@ +""" + +""" + +import re +import unicodedata + +from ftfy import fix_text + +from . 
import constants + +# fall back to `unicodedata` +try: + from unidecode import unidecode +except: + pass + + +strange_double_quotes = [ + "«", + "‹", + "»", + "›", + "„", + "“", + "‟", + "”", + "❝", + "❞", + "❮", + "❯", + "〝", + "〞", + "〟", + """, +] +strange_single_quotes = ["‘", "‛", "’", "❛", "❜", "`", "´", '‘','’'] + + + + +def fix_strange_quotes(text): + text = str(text) + for q in strange_double_quotes: + text = text.replace(q, '"') + for q in strange_single_quotes: + text = text.replace(q, "'") + return text + + +def fix_bad_unicode(text, normalization="NFC"): + """ + Fix unicode text that's "broken" using `ftfy `_; + this includes mojibake, HTML entities and other code cruft, + and non-standard forms for display purposes. + Args: + text (str): raw text + normalization ({'NFC', 'NFKC', 'NFD', 'NFKD'}): if 'NFC', + combines characters and diacritics written using separate code points, + e.g. converting "e" plus an acute accent modifier into "é"; unicode + can be converted to NFC form without any change in its meaning! + if 'NFKC', additional normalizations are applied that can change + the meanings of characters, e.g. ellipsis characters will be replaced + with three periods + Returns: + str + """ + # fix if the unicode is fucked up + text = text.encode().decode("unicode-escape") + + # normalize quotes before + text = fix_strange_quotes(text) + + return fix_text(text, normalization=normalization) + + +def ascii_unicode(text): + """ + Try to represent unicode data in ascii characters similar to what a human + with a US keyboard would choose. + Works great for languages of Western origin, worse the farther the language + gets from Latin-based alphabets. It's based on hand-tuned character mappings + that also contain ascii approximations for symbols and non-Latin alphabets. 
+ """ + return unidecode(text) + + +def normalize_whitespace(text): + """ + Given ``text`` str, replace one or more spacings with a single space, and one + or more linebreaks with a single newline. Also strip leading/trailing whitespace. + """ + return constants.NONBREAKING_SPACE_REGEX.sub( + " ", constants.LINEBREAK_REGEX.sub(r"\n", text) + ).strip() + + +def unpack_contractions(text): + """ + Replace *English* contractions in ``text`` str with their unshortened forms. + N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive), + so are left as-is. + """ + # standard + text = re.sub( + r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n't", + r"\1\2 not", + text, + ) + text = re.sub( + r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll", + r"\1\2 will", + text, + ) + text = re.sub(r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re", r"\1\2 are", text) + text = re.sub( + r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)'ve", + r"\1\2 have", + text, + ) + # non-standard + text = re.sub(r"(\b)([Cc]a)n't", r"\1\2n not", text) + text = re.sub(r"(\b)([Ii])'m", r"\1\2 am", text) + text = re.sub(r"(\b)([Ll]et)'s", r"\1\2 us", text) + text = re.sub(r"(\b)([Ww])on't", r"\1\2ill not", text) + text = re.sub(r"(\b)([Ss])han't", r"\1\2hall not", text) + text = re.sub(r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all", text) + return text + + +def replace_urls(text, replace_with=""): + """Replace all URLs in ``text`` str with ``replace_with`` str.""" + return constants.URL_REGEX.sub( + replace_with, constants.SHORT_URL_REGEX.sub(replace_with, text) + ) + + +def replace_emails(text, replace_with=""): + """Replace all emails in ``text`` str with ``replace_with`` str.""" + return constants.EMAIL_REGEX.sub(replace_with, text) + + +def replace_phone_numbers(text, replace_with=""): + """Replace all phone numbers in ``text`` str with ``replace_with`` str.""" + return 
constants.PHONE_REGEX.sub(replace_with, text) + + +def replace_numbers(text, replace_with=""): + """Replace all numbers in ``text`` str with ``replace_with`` str.""" + return constants.NUMBERS_REGEX.sub(replace_with, text) + + +def replace_currency_symbols(text, replace_with=None): + """ + Replace all currency symbols in ``text`` str with string specified by ``replace_with`` str. + Args: + text (str): raw text + replace_with (str): if None (default), replace symbols with + their standard 3-letter abbreviations (e.g. '$' with 'USD', '£' with 'GBP'); + otherwise, pass in a string with which to replace all symbols + (e.g. "*CURRENCY*") + Returns: + str + """ + if replace_with is None: + for k, v in constants.CURRENCIES.items(): + text = text.replace(k, v) + return text + else: + return constants.CURRENCY_REGEX.sub(replace_with, text) + + +def remove_punct(text, marks=None): + """ + Remove punctuation from ``text`` by replacing all instances of ``marks`` + with whitespace. + Args: + text (str): raw text + marks (str): If specified, remove only the characters in this string, + e.g. ``marks=',;:'`` removes commas, semi-colons, and colons. + Otherwise, all punctuation marks are removed. + Returns: + str + Note: + When ``marks=None``, Python's built-in :meth:`str.translate()` is + used to remove punctuation; otherwise, a regular expression is used + instead. The former's performance is about 5-10x faster. + """ + if marks: + return re.sub("[{}]+".format(re.escape(marks)), " ", text, flags=re.UNICODE) + else: + return text.translate(constants.PUNCT_TRANSLATE_UNICODE) + + +def remove_accents(text, method="unicode"): + """ + Remove accents from any accented unicode characters in ``text`` str, either by + transforming them into ascii equivalents or removing them entirely. 
+ Args: + text (str): raw text + method ({'unicode', 'ascii'}): if 'unicode', remove accented + char for any unicode symbol with a direct ASCII equivalent; if 'ascii', + remove accented char for any unicode symbol + NB: the 'ascii' method is notably faster than 'unicode', but less good + Returns: + str + Raises: + ValueError: if ``method`` is not in {'unicode', 'ascii'} + """ + if method == "unicode": + return "".join( + c + for c in unicodedata.normalize("NFKD", text) + if not unicodedata.combining(c) + ) + elif method == "ascii": + return ( + unicodedata.normalize("NFKD", text) + .encode("ascii", errors="ignore") + .decode("ascii") + ) + else: + msg = '`method` must be either "unicode" and "ascii", not {}'.format(method) + raise ValueError(msg) + + +def zero_digits(text): + """ + All digits are reduced to 0. 123.34 to 000.00 + """ + return re.sub(r"\d", "0", text) + + +def clean( + text, + fix_unicode=False, + lower=False, + ascii=False, + no_urls=False, + no_emails=False, + no_phone_numbers=False, + no_numbers=False, + no_currency_symbols=False, + no_punct=False, + no_contractions=False, + no_accents=False, +): + """ + Normalize various aspects of a raw text doc before parsing it with Spacy. + A convenience function for applying all other preprocessing functions in one go. 
+ Args: + text (str): raw text to preprocess + fix_unicode (bool): if True, fix "broken" unicode such as + mojibake and garbled HTML entities + lower (bool): if True, all text is lower-cased + ascii (bool): if True, convert non-ascii characters + into their closest ascii equivalents + no_urls (bool): if True, replace all URL strings with '*URL*' + no_emails (bool): if True, replace all email strings with '*EMAIL*' + no_phone_numbers (bool): if True, replace all phone number strings + with '*PHONE*' + no_numbers (bool): if True, replace all number-like strings + with '*NUMBER*' + no_currency_symbols (bool): if True, replace all currency symbols + with their standard 3-letter abbreviations + no_punct (bool): if True, remove all punctuation (replace with + empty string) + no_contractions (bool): if True, replace *English* contractions + with their unshortened forms + no_accents (bool): if True, replace all accented characters + with unaccented versions; NB: if `ascii` is True, this option + is redundant + Returns: + str: input ``text`` processed according to function args + Warning: + These changes may negatively affect subsequent NLP analysis performed + on the text, so choose carefully, and preprocess at your own risk! 
+ """ + if fix_unicode is True: + text = fix_bad_unicode(text, normalization="NFC") + if ascii is True: + text = ascii_unicode(text) + if no_urls is True: + text = replace_urls(text) + if no_emails is True: + text = replace_emails(text) + if no_phone_numbers is True: + text = replace_phone_numbers(text) + if no_numbers is True: + text = replace_numbers(text) + if no_currency_symbols is True: + text = replace_currency_symbols(text) + if no_contractions is True: + text = unpack_contractions(text) + if no_accents is True: + text = remove_accents(text, method="unicode") + if no_punct is True: + text = remove_punct(text) + if lower is True: + text = text.lower() + # always normalize whitespace; treat linebreaks separately from spacing + text = normalize_whitespace(text) + + return text diff --git a/cleantext/compat.py b/cleantext/compat.py new file mode 100644 index 0000000..ad91d35 --- /dev/null +++ b/cleantext/compat.py @@ -0,0 +1,52 @@ +""" +""" + +from __future__ import print_function + +import sys + +is_python2 = int(sys.version[0]) == 2 +is_windows = sys.platform.startswith("win") +is_linux = sys.platform.startswith("linux") +is_osx = sys.platform == "darwin" + +if is_python2: + import cPickle as pickle + from backports import csv + from itertools import izip as zip_ + from urlparse import urljoin + + range_ = xrange + + bytes_ = str + unicode_ = unicode + string_types = (str, unicode) + int_types = (int, long) + chr_ = unichr + + def unicode_to_bytes(s, encoding="utf8", errors="strict"): + return s.encode(encoding=encoding, errors=errors) + + def bytes_to_unicode(b, encoding="utf8", errors="strict"): + return unicode_(b, encoding=encoding, errors=errors) + + +else: + import csv + import pickle + from builtins import zip as zip_ + from urllib.parse import urljoin + + range_ = range + + bytes_ = bytes + unicode_ = str + string_types = (bytes, str) + int_types = (int,) + chr_ = chr + + def unicode_to_bytes(s, encoding="utf8", errors="strict"): + return 
s.encode(encoding=encoding, errors=errors) + + def bytes_to_unicode(b, encoding="utf8", errors="strict"): + return b.decode(encoding=encoding, errors=errors) diff --git a/cleantext/constants.py b/cleantext/constants.py new file mode 100644 index 0000000..4763a66 --- /dev/null +++ b/cleantext/constants.py @@ -0,0 +1,111 @@ +""" + +""" + +import re +import sys +import unicodedata + +from . import compat + +CURRENCIES = { + "$": "USD", + "zł": "PLN", + "£": "GBP", + "¥": "JPY", + "฿": "THB", + "₡": "CRC", + "₦": "NGN", + "₩": "KRW", + "₪": "ILS", + "₫": "VND", + "€": "EUR", + "₱": "PHP", + "₲": "PYG", + "₴": "UAH", + "₹": "INR", +} + + +PUNCT_TRANSLATE_UNICODE = dict.fromkeys( + ( + i + for i in compat.range_(sys.maxunicode) + if unicodedata.category(compat.chr_(i)).startswith("P") + ), + u" ", +) + +ACRONYM_REGEX = re.compile( + r"(?:^|(?<=\W))(?:(?:(?:(?:[A-Z]\.?)+[a-z0-9&/-]?)+(?:[A-Z][s.]?|[0-9]s?))|(?:[0-9](?:\-?[A-Z])+))(?:$|(?=\W))", + flags=re.UNICODE, +) +EMAIL_REGEX = re.compile( + r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(\.([a-z]{2,})){1,3}(?:$|(?=\b))", + flags=re.IGNORECASE | re.UNICODE, +) +PHONE_REGEX = re.compile( + r"(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W))" +) +NUMBERS_REGEX = re.compile( + r"(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|(\d*?[.,]\d+)|\d+)(?:$|(?=\b))" +) +CURRENCY_REGEX = re.compile( + "({})+".format("|".join(re.escape(c) for c in CURRENCIES.keys())) +) +LINEBREAK_REGEX = re.compile(r"((\r\n)|[\n\v])+") +NONBREAKING_SPACE_REGEX = re.compile(r"(?!\n)\s+") +URL_REGEX = re.compile( + r"(?:^|(?= 224.0.0.0 + # excludes network & broadcast addresses + # (first & last IP address of each class) + r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])" + r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" + r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" + r"|" + # host name + r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)" 
+ # domain name + r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*" + # TLD identifier + r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" r")" + # port number + r"(?::\d{2,5})?" + # resource path + r"(?:/\S*)?" r"(?:$|(?![\w?!+&/]))", + flags=re.UNICODE | re.IGNORECASE, +) # source: https://gist.github.com/dperini/729294 + +SHORT_URL_REGEX = re.compile( + r"(?:^|(?