Mirror of https://github.com/ViralLab/TurkishBERTweet.git (synced 2023-12-19 18:19:59 +03:00)
Preprocessor is added
Preprocessor/__init__.py (Normal file, 2 lines)
@@ -0,0 +1,2 @@
from .demojize import demojize
from .preprocessor import preprocess
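Both entry points are re-exported at package level, so callers can import them directly (a minimal sketch):

    from Preprocessor import demojize, preprocess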
Preprocessor/demojize.py (Normal file, 91 lines)
@@ -0,0 +1,91 @@
import json
import os

_DIR = os.path.dirname(__file__)
EMOJI_DATA_PATH = os.path.join(_DIR, "emojis_tr_twitter.json")

# Emoji -> metadata mapping with Turkish codes.
with open(EMOJI_DATA_PATH, "r", encoding="utf-8") as f:
    emojis = json.load(f)

_SEARCH_TREE = None


def _get_search_tree():
    """Build (once) a character trie over all emoji sequences.

    Each path spells one emoji; the terminal node stores that emoji's
    metadata under the "data" key, so multi-codepoint emojis can be
    matched longest-first.
    """
    global _SEARCH_TREE
    if _SEARCH_TREE is None:
        _SEARCH_TREE = {}
        for emj in emojis:
            sub_tree = _SEARCH_TREE
            lastidx = len(emj) - 1
            for i, char in enumerate(emj):
                if char not in sub_tree:
                    sub_tree[char] = {}
                sub_tree = sub_tree[char]
                if i == lastidx:
                    sub_tree["data"] = emojis[emj]
    return _SEARCH_TREE


def demojize(
    string,
    delimiters=("<emoji> ", " </emoji>"),
    language="tr",
    version=None,
    handle_version=None,
):
    """Replace every known emoji in `string` with its tagged text code."""
    if language == "alias":
        language = "tr"
        _use_aliases = True
    else:
        _use_aliases = False
    tree = _get_search_tree()
    result = []
    i = 0
    length = len(string)
    while i < length:
        consumed = False
        char = string[i]
        if char in tree:
            # Follow the trie as far as the input allows (longest match).
            j = i + 1
            sub_tree = tree[char]
            while j < length and string[j] in sub_tree:
                sub_tree = sub_tree[string[j]]
                j += 1
            if "data" in sub_tree:
                emj_data = sub_tree["data"]
                code_points = string[i:j]
                replace_str = None
                if version is not None and emj_data["E"] > version:
                    # Emoji is newer than the requested version.
                    if callable(handle_version):
                        emj_data = emj_data.copy()
                        emj_data["match_start"] = i
                        emj_data["match_end"] = j
                        replace_str = handle_version(code_points, emj_data)
                    elif handle_version is not None:
                        replace_str = str(handle_version)
                    else:
                        replace_str = None
                elif language in emj_data:
                    if _use_aliases and "alias" in emj_data:
                        replace_str = (
                            delimiters[0] + emj_data["alias"][0][:-1] + delimiters[1]
                        )
                    else:
                        # Strip the surrounding colons from the ":code:" form.
                        replace_str = (
                            delimiters[0] + emj_data[language][1:-1] + delimiters[1]
                        )
                else:
                    # The emoji exists, but it is not translated, so we keep the emoji
                    replace_str = code_points

                i = j - 1
                consumed = True
                if replace_str:
                    result.append(replace_str)

        # Drop stray variation selectors (text/emoji presentation marks).
        if not consumed and char != "\ufe0e" and char != "\ufe0f":
            result.append(char)
        i += 1

    return "".join(result)
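A minimal usage sketch. The replacement token shown is hypothetical; the actual output depends on the Turkish codes stored in emojis_tr_twitter.json:

    from Preprocessor.demojize import demojize

    # The trie walk finds the longest emoji sequence starting at each
    # position and wraps its Turkish code in the delimiters.
    print(demojize("harika \U0001F44D"))
    # e.g. "harika <emoji> basparmak_yukari </emoji>"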
Preprocessor/emojis_tr_twitter.json (Normal file, 10958 lines)
File diff suppressed because it is too large.
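Although the diff is suppressed, the way demojize.py consumes the file pins down its shape: a JSON object keyed by emoji string, each value carrying at least "E" (an emoji-version number) and a "tr" code, plus an optional "alias" list. A hypothetical entry, for illustration only:

    # Shape inferred from the fields demojize() reads; the values are made up.
    example_entry = {
        "\U0001F44D": {"E": 0.6, "tr": ":basparmak_yukari:", "alias": [":thumbsup:"]}
    }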
Preprocessor/preprocessor.py (Normal file, 73 lines)
@@ -0,0 +1,73 @@
import urllib.parse
import html
import re

from urlextract import URLExtract
from unicodedata import normalize

from .demojize import demojize


def hashtag_handler(text: str):
    # "#etiket" -> " <hashtag> etiket </hashtag> "
    pattern = r"(#([^\s]+))"
    return re.sub(pattern, " <hashtag> \\2 </hashtag> ", text)


def cashtag_handler(text: str):
    # "$THYAO" -> " <cashtag> THYAO </cashtag> "
    pattern = r"(\$([^\s]+))"
    return re.sub(pattern, " <cashtag> \\2 </cashtag> ", text)


def mention_handler(text: str):
    # Anonymize "@handle" as "@user".
    pattern = r"(@([^\s]+))"
    return re.sub(pattern, " @user ", text)


url_extractor = URLExtract()


def url_handler(text: str):
    # Replace each URL with its bare domain wrapped in <http> tags.
    urls = list(url_extractor.gen_urls(text))
    updated_urls = [url if "http" in url else f"https://{url}" for url in urls]
    domains = [urllib.parse.urlparse(url_text).netloc for url_text in updated_urls]
    for i in range(len(domains)):
        text = text.replace(urls[i], f" <http> {domains[i]} </http> ")
    return text


def email_handler(text: str):
    pattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
    match = re.findall(pattern, text)
    for m in match:
        text = text.replace(m, " <email> ").strip()
    return text


def emoji_handler(text: str):
    return demojize(text, language="tr", delimiters=(" <emoji> ", " </emoji> "))


def normalize_text(text: str):
    # NFC-normalize so visually identical Turkish characters compare equal.
    return normalize("NFC", text)


def preprocess(text: str):
    output = html.unescape(text)
    output = normalize_text(output)
    output = email_handler(output)
    output = url_handler(output)
    output = hashtag_handler(output)
    output = cashtag_handler(output)
    output = mention_handler(output)
    output = emoji_handler(output)
    output = re.sub(r"\s+", " ", output)  # collapse repeated whitespace
    output = output.lower()
    output = output.strip()

    return output


if __name__ == "__main__":
    sample_text = ""
    preprocessed_text = preprocess(sample_text)
    print(preprocessed_text)
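A hypothetical end-to-end run; the emoji token again depends on emojis_tr_twitter.json, and URLExtract needs its TLD list available on first use:

    from Preprocessor import preprocess

    tweet = "@kullanici Harika haber! #TurkishBERTweet $THYAO https://example.com \U0001F44D"
    print(preprocess(tweet))
    # e.g. "@user harika haber! <hashtag> turkishbertweet </hashtag>
    #       <cashtag> thyao </cashtag> <http> example.com </http>
    #       <emoji> basparmak_yukari </emoji>"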