Preprocessor is added

This commit is contained in:
AliNajafi
2023-09-29 13:43:14 +03:00
parent 1a355b4ed2
commit 002af1e8fb
4 changed files with 11124 additions and 0 deletions

2
Preprocessor/__init__.py Normal file
View File

@@ -0,0 +1,2 @@
from .demojize import demojize
from .preprocessor import preprocess

91
Preprocessor/demojize.py Normal file
View File

@@ -0,0 +1,91 @@
import json
import os
# Directory containing this module; the emoji table is looked up next to it.
# NOTE: this name shadows the builtin ``dir``. It is kept unchanged so that
# any existing reference to this module attribute keeps working.
dir = os.path.dirname(__file__)
EMOJI_DATA_PATH = os.path.join(dir, "emojis_tr_twitter.json")

# Decode explicitly as UTF-8: the table's keys are iterated character by
# character to build the search trie, so they must not be decoded with a
# platform-dependent default encoding (which is not UTF-8 on every system).
with open(EMOJI_DATA_PATH, "r", encoding="utf-8") as f:
    emojis = json.load(f)
# Cache for the character trie built from ``emojis``; filled on first use by
# ``_get_search_tree``.
_SEARCH_TREE = None


def _get_search_tree():
    """Return the character trie over the keys of ``emojis``, building it once.

    Every key is inserted one character per level. The node reached by the
    key's final character carries the key's value under the ``"data"`` entry.
    The trie is stored in the module-level ``_SEARCH_TREE`` so later calls
    return the same object without rebuilding it.
    """
    global _SEARCH_TREE
    if _SEARCH_TREE is not None:
        return _SEARCH_TREE

    _SEARCH_TREE = {}
    for sequence, payload in emojis.items():
        node = _SEARCH_TREE
        for symbol in sequence:
            node = node.setdefault(symbol, {})
        # An empty key has no final character, so it stores nothing.
        if sequence:
            node["data"] = payload
    return _SEARCH_TREE
def demojize(
    string,
    delimiters=("<emoji> ", " </emoji>"),
    language="tr",
    version=None,
    handle_version=None,
):
    """Replace emoji sequences in *string* with delimited text names.

    The input is scanned left to right against the trie returned by
    ``_get_search_tree``. At each position the longest complete emoji
    sequence is taken, even when the following characters only partially
    match a longer sequence.

    Args:
        string: Text to convert.
        delimiters: Pair of strings placed before and after each emoji name.
        language: Key of the emoji record holding the name to emit. The
            special value ``"alias"`` selects the ``"tr"`` name but prefers
            the first entry of the record's ``"alias"`` list when present.
        version: When not ``None``, emoji whose ``"E"`` field is greater
            than this value are routed through *handle_version* instead of
            being named.
        handle_version: Used only for emoji filtered out by *version*. A
            callable is invoked as ``handle_version(emoji, data)`` where
            ``data`` is a copy of the record extended with ``match_start``
            and ``match_end``; its return value is the replacement. Any
            other non-``None`` value is converted with ``str`` and used as
            the replacement. ``None`` removes the emoji.

    Returns:
        The converted text. Characters U+FE0E and U+FE0F that are not part
        of a matched emoji are dropped; every other unmatched character is
        copied unchanged. An emoji without a name for *language* is kept
        as is.
    """
    use_aliases = language == "alias"
    if use_aliases:
        language = "tr"

    tree = _get_search_tree()
    result = []
    length = len(string)
    i = 0
    while i < length:
        char = string[i]

        # Walk the trie as far as the input allows, remembering the last
        # position at which a complete emoji ended. Without this fallback a
        # walk that overshoots into a partial longer sequence would leave
        # the already-complete emoji unconverted.
        node = tree.get(char)
        emj_data = None
        match_end = i
        j = i + 1
        while node is not None:
            if "data" in node:
                emj_data, match_end = node["data"], j
            node = node.get(string[j]) if j < length else None
            j += 1

        if emj_data is None:
            # No emoji starts here: copy the character, except for stray
            # variation selectors, which are dropped.
            if char != "\ufe0e" and char != "\ufe0f":
                result.append(char)
            i += 1
            continue

        code_points = string[i:match_end]
        if version is not None and emj_data["E"] > version:
            if callable(handle_version):
                # Copy so the shared emoji table is never mutated.
                emj_data = emj_data.copy()
                emj_data["match_start"] = i
                emj_data["match_end"] = match_end
                replace_str = handle_version(code_points, emj_data)
            elif handle_version is not None:
                replace_str = str(handle_version)
            else:
                replace_str = None
        elif language in emj_data:
            if use_aliases and "alias" in emj_data:
                name = emj_data["alias"][0]
            else:
                name = emj_data[language]
            # Drop the first and last character of the stored name. The
            # alias branch previously dropped only the last one, leaving the
            # leading marker in the output.
            # NOTE(review): assumes aliases are stored in the same wrapped
            # form as translated names -- confirm against the data file.
            replace_str = delimiters[0] + name[1:-1] + delimiters[1]
        else:
            # The emoji exists, but it is not translated, so we keep the emoji
            replace_str = code_points

        if replace_str:
            result.append(replace_str)
        i = match_end

    return "".join(result)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,73 @@
import urllib
import html
import re
from urlextract import URLExtract
from unicodedata import normalize
from .demojize import demojize
def hashtag_handler(text: str):
    """Wrap each hashtag in *text* in space-padded hashtag tags.

    A hashtag is a ``#`` followed by one or more non-whitespace characters.
    The ``#`` itself is dropped and the rest is placed between the tags.
    """
    pattern = r"(#([^\s]+))"
    replacement = " <hashtag> \\2 </hashtag> "
    return re.compile(pattern).sub(replacement, text)
def cashtag_handler(text: str):
    """Wrap each cashtag in *text* in space-padded cashtag tags.

    A cashtag is a ``$`` followed by one or more non-whitespace characters.
    The ``$`` itself is dropped and the rest is placed between the tags.
    """
    pattern = r"(\$([^\s]+))"
    replacement = " <cashtag> \\2 </cashtag> "
    return re.compile(pattern).sub(replacement, text)
def mention_handler(text: str):
    """Replace each mention in *text* with the space-padded ``@user`` token.

    A mention is an ``@`` followed by one or more non-whitespace characters;
    the whole run is replaced, so the original handle is discarded.
    """
    pattern = r"(@([^\s]+))"
    placeholder = " @user "
    return re.compile(pattern).sub(placeholder, text)
url_extractor = URLExtract()


def url_handler(text: str):
    """Replace every URL in *text* with its host wrapped in http tags.

    URLs are found with the module-level ``url_extractor``. Each one is
    replaced by the space-padded opening http tag, the URL's network
    location as reported by ``urllib.parse.urlparse``, and the closing tag.
    A URL without a scheme is parsed as if it started with ``https://``.

    All URLs are substituted in a single pass, longest first, so text that
    has already been inserted is never rewritten again. This matters when
    the same URL occurs twice or when one URL is a substring of another.

    Returns:
        The rewritten text, or *text* unchanged when no URL is found.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import urlparse

    # De-duplicate while keeping first-seen order; skip empty results so the
    # alternation built below can never match the empty string.
    urls = list(dict.fromkeys(u for u in url_extractor.gen_urls(text) if u))
    if not urls:
        return text

    def _tagged_domain(url):
        # Detect a real leading scheme instead of testing for the substring
        # "http", which also matches hosts or paths that merely contain it.
        has_scheme = re.match(r"[A-Za-z][A-Za-z0-9+.-]*://", url) is not None
        parsed = urlparse(url if has_scheme else f"https://{url}")
        return f" <http> {parsed.netloc} </http> "

    replacements = {url: _tagged_domain(url) for url in urls}
    # Longest alternatives first so a URL that contains a shorter one is
    # matched as a whole.
    longest_first = sorted(replacements, key=len, reverse=True)
    matcher = re.compile("|".join(re.escape(url) for url in longest_first))
    return matcher.sub(lambda found: replacements[found.group(0)], text)
def email_handler(text: str):
    """Replace every e-mail address in *text* with the ``<email>`` token.

    Addresses are matched with a simple ``local@domain.tld`` pattern and
    each match is replaced by the token padded with one space on each side.
    The substitution runs in a single regex pass, so an address that
    contains another matched address as a substring (for example
    ``jbob@x.com`` and ``bob@x.com``) is replaced as a whole instead of
    leaving a stray prefix behind.

    Returns:
        The rewritten text with leading and trailing whitespace stripped
        when at least one address was replaced; otherwise *text* unchanged.
    """
    pattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
    replaced, count = re.subn(pattern, " <email> ", text)
    # Strip only when something was replaced, so inputs without an address
    # are returned untouched.
    return replaced.strip() if count else text
def emoji_handler(text: str):
    """Run ``demojize`` on *text* with the ``"tr"`` names and padded emoji tags."""
    tag_pair = (" <emoji> ", " </emoji> ")
    return demojize(text, delimiters=tag_pair, language="tr")
def normalize_text(text: str):
    """Return *text* in Unicode normalization form NFC."""
    form = "NFC"
    composed = normalize(form, text)
    return composed
def preprocess(text: str):
    """Run the full cleaning pipeline on *text* and return the result.

    Steps, in order: HTML entities are unescaped; then the text passes
    through NFC normalization and the e-mail, URL, hashtag, cashtag, mention
    and emoji handlers; finally every whitespace run is collapsed to one
    space, the text is lower-cased and surrounding whitespace is stripped.
    """
    pipeline = (
        normalize_text,
        email_handler,
        url_handler,
        hashtag_handler,
        cashtag_handler,
        mention_handler,
        emoji_handler,
    )
    cleaned = html.unescape(text)
    for step in pipeline:
        cleaned = step(cleaned)
    collapsed = re.sub(r"\s+", " ", cleaned)
    return collapsed.lower().strip()
if __name__ == "__main__":
    # Manual smoke run: preprocess the sample below and print the result.
    sample_text = ""
    print(preprocess(sample_text))