added clean_text function

This commit is contained in:
georgemihaila
2020-09-17 23:54:03 -05:00
parent e10fc6d8c9
commit 78c0373fe0
3 changed files with 109 additions and 0 deletions

View File

@@ -115,6 +115,20 @@ download_from(url, path)
### clean_text [[source]](https://github.com/gmihaila/ml_things/blob/9ea16e6df75a907fadf8c40b29ef7b3da9d37701/src/ml_things/text_functions.py#L22)
```python
clean_text(text, full_clean=False, punctuation=False, numbers=False, lower=False, extra_spaces=False,
control_characters=False, tokenize_whitespace=False, remove_characters='')
```
|Description:|Clean text using various techniques.|
|:-|:-|
|**Parameters:**|**:param** <br>&nbsp;&nbsp; text: string that needs cleaning. <br>**:param** <br>&nbsp;&nbsp; full_clean: remove: punctuation, numbers, extra space, control characters and lower case. <br>**:param** <br>&nbsp;&nbsp; punctuation: remove punctuation from text. <br>**:param** <br>&nbsp;&nbsp; numbers: remove digits from text. <br>**:param** <br>&nbsp;&nbsp; lower: lower case all text. <br>**:param** <br>&nbsp;&nbsp; extra_spaces: remove extra spaces - everything beyond one space. <br>**:param** <br>&nbsp;&nbsp; control_characters: remove characters like `\n`, `\t` etc. <br>**:param** <br>&nbsp;&nbsp; tokenize_whitespace: return a list of tokens split on whitespace. <br>**:param** <br>&nbsp;&nbsp; remove_characters: remove defined characters form text. <br>|
|**Returns:**|**:return:** <br>&nbsp;&nbsp; cleaned text or list of tokens of cleaned text.|
# Snippets
This is a very large variety of Python snippets without a certain theme. I put them in the most frequently used ones while keeping a logical order.

View File

@@ -10,6 +10,7 @@ from .array_functions import (pad_array,
from .web_related import (download_from)
from .plot_functions import (plot_array,
plot_confusion_matrix)
from .text_functions import (clean_text)
# installed ftfy to fix any UNICODE problems in text data
from ftfy import fix_text

View File

@@ -0,0 +1,94 @@
# coding=utf-8
# Copyright 2020 George Mihaila.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions that deal with text/string"""
import re
import copy
import string
def clean_text(text, full_clean=False, punctuation=False, numbers=False, lower=False, extra_spaces=False,
control_characters=False, tokenize_whitespace=False, remove_characters=''):
"""Clean text using various techniques.
:param text: string that needs cleaning
:param full_clean: remove: punctuation, numbers, extra space, control characters and lower case
:param punctuation: remove punctuation from text.
:param numbers: remove digits from text.
:param lower: lower case all text.
:param extra_spaces: remove extra spaces - everything beyond one space.
:param control_characters: remove characters like `\n`, `\t` etc.
:param tokenize_whitespace: return a list of tokens split on whitespace.
:param remove_characters: remove defined characters form text.
:return: cleaned text or list of tokens of cleaned text.
"""
if not isinstance(text, str):
# `text` is not type of string
raise ValueError("`text` is not of type str!")
if not isinstance(remove_characters, str):
# remove characters need to be a string
raise ValueError("`remove_characters` needs to be a string!")
# all control characters like `\t` `\n` `\r` etc.
# Stack Overflow: https://stackoverflow.com/a/8115378/11281368
control_characters_list = ''.join([chr(char) for char in range(1, 32)])
# define control characters table
table_control_characters = str.maketrans(dict.fromkeys(control_characters_list))
# remove punctuation table
table_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
# remove numbers table
table_digits = str.maketrans(dict.fromkeys('0123456789'))
# remove certain characters table
table_remove_characters = str.maketrans(dict.fromkeys(remove_characters))
# make a copy of text to make sure it doesn't affect original text
cleaned = copy.deepcopy(text)
if full_clean or punctuation:
# remove punctuation
cleaned = cleaned.translate(table_punctuation)
if full_clean or numbers:
# remove numbers
cleaned = cleaned.translate(table_digits)
if full_clean or extra_spaces:
# remove extra spaces - also removes control characters
# Stack Overflow https://stackoverflow.com/a/2077906/11281368
cleaned = re.sub('\s+', ' ', cleaned).strip()
if full_clean or lower:
# lowercase
cleaned = cleaned.lower()
if control_characters:
# remove control characters
cleaned = cleaned.translate(table_control_characters)
if tokenize_whitespace:
# tokenizes text n whitespace
cleaned = re.split('\s+', cleaned)
if remove_characters:
# remove these characters from text
cleaned = cleaned.translate(table_remove_characters)
return cleaned