added clean_text function

2021-10-04 01:29:04 +03:00 · 2020-09-17 23:54:03 -05:00
parent e10fc6d8c9
commit 78c0373fe0
3 changed files with 109 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -115,6 +115,20 @@ download_from(url, path)



+### clean_text [[source]](https://github.com/gmihaila/ml_things/blob/9ea16e6df75a907fadf8c40b29ef7b3da9d37701/src/ml_things/text_functions.py#L22)
+
+```python
+clean_text(text, full_clean=False, punctuation=False, numbers=False, lower=False, extra_spaces=False,
+               control_characters=False, tokenize_whitespace=False, remove_characters='')
+```
+
+
+|Description:|Clean text using various techniques.|
+|:-|:-|
+|**Parameters:**|**:param** <br>&nbsp;&nbsp;  text: string that needs cleaning. <br>**:param** <br>&nbsp;&nbsp;  full_clean: remove: punctuation, numbers, extra space, control characters and lower case. <br>**:param** <br>&nbsp;&nbsp;  punctuation: remove punctuation from text. <br>**:param** <br>&nbsp;&nbsp;  numbers: remove digits from text. <br>**:param** <br>&nbsp;&nbsp;  lower: lower case all text. <br>**:param** <br>&nbsp;&nbsp;  extra_spaces: remove extra spaces - everything beyond one space. <br>**:param** <br>&nbsp;&nbsp;  control_characters: remove characters like `\n`, `\t` etc. <br>**:param** <br>&nbsp;&nbsp;  tokenize_whitespace: return a list of tokens split on whitespace. <br>**:param** <br>&nbsp;&nbsp;  remove_characters: remove defined characters from text. <br>|
+|**Returns:**|**:return:** <br>&nbsp;&nbsp; cleaned text or list of tokens of cleaned text.|
+
+
 # Snippets

 This is a very large variety of Python snippets without a certain theme. I put them in the most frequently used ones while keeping a logical order.
--- a/src/ml_things/init.py
+++ b/src/ml_things/init.py
@@ -10,6 +10,7 @@ from .array_functions import (pad_array,
 from .web_related import (download_from)
 from .plot_functions import (plot_array,
                             plot_confusion_matrix)
+from .text_functions import (clean_text)
 # installed ftfy to fix any UNICODE problems in text data
 from ftfy import fix_text

--- a/src/ml_things/text_functions.py
+++ b/src/ml_things/text_functions.py
@@ -0,0 +1,94 @@
+# coding=utf-8
+# Copyright 2020 George Mihaila.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions that deal with text/string"""
+
+import re
+import copy
+import string
+
+
+def clean_text(text, full_clean=False, punctuation=False, numbers=False, lower=False, extra_spaces=False,
+               control_characters=False, tokenize_whitespace=False, remove_characters=''):
+    r"""Clean text using various techniques.
+
+    Steps run in a fixed order: punctuation, digits, extra spaces, lower casing, control
+    characters, `remove_characters` and, last, tokenization.
+
+    :param text: string that needs cleaning.
+    :param full_clean: apply punctuation, numbers, extra_spaces, control_characters and lower.
+    :param punctuation: remove punctuation (`string.punctuation`) from text.
+    :param numbers: remove digits from text.
+    :param lower: lower case all text.
+    :param extra_spaces: collapse each whitespace run to one space and strip both ends.
+    :param control_characters: remove characters like `\n`, `\t` etc. (code points 1 to 31).
+    :param tokenize_whitespace: return a list of tokens split on whitespace.
+    :param remove_characters: string of characters to remove from text.
+    :return: cleaned text or list of tokens of cleaned text.
+    :raises ValueError: if `text` or `remove_characters` is not a string.
+    """
+
+    if not isinstance(text, str):
+        # `text` is not type of string
+        raise ValueError("`text` is not of type str!")
+
+    if not isinstance(remove_characters, str):
+        # remove characters need to be a string
+        raise ValueError("`remove_characters` needs to be a string!")
+
+    # all control characters like `\t` `\n` `\r` etc.
+    # Stack Overflow: https://stackoverflow.com/a/8115378/11281368
+    control_characters_list = ''.join(chr(char) for char in range(1, 32))
+
+    # translation tables: the third `str.maketrans` argument lists characters to delete
+    table_control_characters = str.maketrans('', '', control_characters_list)
+    table_punctuation = str.maketrans('', '', string.punctuation)
+    table_digits = str.maketrans('', '', string.digits)
+    table_remove_characters = str.maketrans('', '', remove_characters)
+
+    # strings are immutable, so the caller's `text` is never modified and no copy is needed
+    cleaned = text
+
+    if full_clean or punctuation:
+        # remove punctuation
+        cleaned = cleaned.translate(table_punctuation)
+
+    if full_clean or numbers:
+        # remove numbers
+        cleaned = cleaned.translate(table_digits)
+
+    if full_clean or extra_spaces:
+        # remove extra spaces - also turns whitespace control characters into a space
+        # Stack Overflow https://stackoverflow.com/a/2077906/11281368
+        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
+
+    if full_clean or lower:
+        # lowercase
+        cleaned = cleaned.lower()
+
+    if full_clean or control_characters:
+        # remove control characters - `full_clean` is documented to include this step
+        cleaned = cleaned.translate(table_control_characters)
+
+    if remove_characters:
+        # remove these characters from text - done before tokenizing because the
+        # list of tokens has no `translate` method
+        cleaned = cleaned.translate(table_remove_characters)
+
+    if tokenize_whitespace:
+        # tokenize text on whitespace - `str.split()` never yields empty tokens,
+        # so leading/trailing whitespace and empty text produce no '' entries
+        cleaned = cleaned.split()
+
+    return cleaned