mirror of https://github.com/hate-alert/DE-LIMIT.git
synced 2021-05-12 18:32:23 +03:00
Move translation file to dataset
@@ -35,4 +35,5 @@ In the above datasets, some of them contain multiple labels for the text such as
## Instructions for getting the datasets
1. Download the datasets from the above sources and place them in the subfolder `Dataset/full_data`
-2. Use the ids given in the `ID Mapping` folder for splitting the datasets into train, val and test splits. Use the file `Stratified Split.ipynb` for doing the splits.
+2. Use `Translation.ipynb` to translate the datasets into English.
+3. Use the ids given in the `ID Mapping` folder for splitting the datasets into train, val and test splits. Use the file `Stratified Split.ipynb` for doing the splits (a rough sketch is shown below).
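The contents of `Stratified Split.ipynb` are not part of this diff. As an illustrative sketch only: the split selects rows by the fixed ids shipped in the `ID Mapping` folder, roughly along these lines (file and column names here are hypothetical, not taken from the repo):

```python
# Illustrative sketch only; the real logic lives in `Stratified Split.ipynb`.
# File and column names below are hypothetical.
import pandas as pd

full = pd.read_csv('Dataset/full_data/example.csv')

# one id list per split, as shipped in the `ID Mapping` folder
for split in ['train', 'val', 'test']:
    ids = pd.read_csv(f'ID Mapping/example_{split}_ids.csv')
    part = full[full['id'].isin(ids['id'])]
    part.to_csv(f'{split}/example.csv', index=False)
```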
164  Dataset/Translation.ipynb  Normal file
@@ -0,0 +1,164 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Necessary imports\n",
    "import re\n",
    "import emoji\n",
    "from gtrans import translate_text, translate_html\n",
    "import random\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from multiprocessing import Pool\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strip emojis from text, since they conflict with translation\n",
    "def remove_emoji(text):\n",
    "    return emoji.get_emoji_regexp().sub('', text)\n",
    "\n",
    "\n",
    "# Re-insert an emoji near its original index, advancing to the next word\n",
    "# boundary so it does not split a translated word\n",
    "def approximate_emoji_insert(string, index, char):\n",
    "    if index < (len(string) - 1):\n",
    "        while string[index] != ' ':\n",
    "            if index + 1 == len(string):\n",
    "                break\n",
    "            index = index + 1\n",
    "        return string[:index] + ' ' + char + ' ' + string[index:]\n",
    "    else:\n",
    "        return string + ' ' + char + ' '\n",
    "\n",
    "\n",
    "# Collect (emoji, position) pairs so they can be restored after translation\n",
    "def extract_emojis(str1):\n",
    "    try:\n",
    "        return [(c, i) for i, c in enumerate(str1) if c in emoji.UNICODE_EMOJI]\n",
    "    except AttributeError:\n",
    "        return []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use multiprocessing to speed up the translation process\n",
    "def parallelize_dataframe(df, func, n_cores=4):\n",
    "    '''Split the dataframe across n_cores workers and apply func to each chunk'''\n",
    "    df_split = np.array_split(df, n_cores)\n",
    "    pool = Pool(n_cores)\n",
    "    df = pd.concat(pool.map(func, df_split))\n",
    "    pool.close()\n",
    "    pool.join()\n",
    "    return df\n",
    "\n",
    "# Main function for translation\n",
    "def translate(x, lang):\n",
    "    '''Translate the given text from lang to English, preserving emojis'''\n",
    "    # x = preprocess_lib.preprocess_multi(x, lang, multiple_sentences=False, stop_word_remove=False, tokenize_word=False, tokenize_sentence=False)\n",
    "    emoji_list = extract_emojis(x)\n",
    "    try:\n",
    "        translated_text = translate_text(x, lang, 'en')\n",
    "    except Exception:\n",
    "        # fall back to the original text if the translation call fails\n",
    "        translated_text = x\n",
    "    for ele in emoji_list:\n",
    "        translated_text = approximate_emoji_insert(translated_text, ele[1], ele[0])\n",
    "    return translated_text\n",
    "\n",
    "def add_features(df):\n",
    "    '''Add a `translated` column to the dataframe'''\n",
    "    translated_text = []\n",
    "    for index, row in df.iterrows():\n",
    "        if row['lang'] in ['en', 'unk']:\n",
    "            # already English (or unknown language): keep the text as is\n",
    "            translated_text.append(row['text'])\n",
    "        else:\n",
    "            translated_text.append(translate(row['text'], row['lang']))\n",
    "    df[\"translated\"] = translated_text\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the train/val/test csv files produced by the split step\n",
    "import glob\n",
    "train_files = glob.glob('train/*.csv')\n",
    "test_files = glob.glob('test/*.csv')\n",
    "val_files = glob.glob('val/*.csv')\n",
    "files = train_files + test_files + val_files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm_notebook\n",
    "size = 10\n",
    "\n",
    "for file in files:\n",
    "    wp_data = pd.read_csv(file)\n",
    "    list_df = []\n",
    "    # translate each file in chunks of `size` rows\n",
    "    for i in tqdm_notebook(range(0, len(wp_data), size)):\n",
    "        print(i, \"_iteration\")\n",
    "        df_new = parallelize_dataframe(wp_data[i:i + size], add_features, n_cores=20)\n",
    "        list_df.append(df_new)\n",
    "    df_translated = pd.concat(list_df, ignore_index=True)\n",
    "    # e.g. 'train/foo.csv' -> 'translated_train_foo.csv' in the working directory\n",
    "    file_name = 'translated_' + file.replace('/', '_')\n",
    "    df_translated.to_csv(file_name)\n"
   ]
  }
],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
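The notebook's translation backend is the `gtrans` wrapper imported in the first cell. If that dependency is unavailable, the `translate_text(x, lang, 'en')` call can be backed by another client; below is a minimal, hypothetical stand-in assuming the third-party `deep-translator` package (not used by this repo):

```python
# Hypothetical stand-in for gtrans.translate_text, assuming the
# third-party deep-translator package (pip install deep-translator).
from deep_translator import GoogleTranslator

def translate_text(text, src_lang, dest_lang):
    '''Mirror the (text, source, target) call shape used in Translation.ipynb.'''
    if not text or not text.strip():
        return text  # the backend rejects empty payloads
    return GoogleTranslator(source=src_lang, target=dest_lang).translate(text)

print(translate_text('wie geht es dir', 'de', 'en'))  # e.g. 'How are you'
```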
File diff suppressed because it is too large