Move translation file to dataset

This commit is contained in:
Saketh
2020-04-14 16:23:18 +05:30
parent 0def1cc6e2
commit c12e146c83
3 changed files with 166 additions and 2030 deletions


@@ -35,4 +35,5 @@ In the above datasets, some of them contain multiple labels for the text such as
## Instructions for getting the datasets
1. Download the datasets from the above sources and place them in the subfolder `Dataset/full_data`.
2. Use `Translation.ipynb` to translate the datasets into English, as sketched below.
3. Use the ids given in the `ID Mapping` folder to split the datasets into train, val and test splits, using the file `Stratified Split.ipynb`.
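
For orientation, a minimal sketch of the translation step, assuming the `gtrans` package behaves as `Translation.ipynb` (included below) uses it; the sample text and language code are illustrative:

```python
# Minimal sketch of step 2, using the same translate_text(text, src, 'en')
# call that Translation.ipynb relies on. Sample text/language are illustrative.
from gtrans import translate_text

sample = "क्या हाल है"  # illustrative Hindi text
print(translate_text(sample, 'hi', 'en'))  # expect an English rendering, e.g. "How are you"
```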

Dataset/Translation.ipynb Normal file

@@ -0,0 +1,164 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Necessary imports\n",
"import re\n",
"import emoji\n",
"from gtrans import translate_text, translate_html\n",
"import random\n",
"import pandas as pd\n",
"import numpy as np\n",
"from multiprocessing import Pool\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Function to remove emojis in text, since these conflict during translation\n",
"def remove_emoji(text):\n",
" return emoji.get_emoji_regexp().sub(u'', text)\n",
"\n",
"\n",
"def approximate_emoji_insert(string, index,char):\n",
" if(index<(len(string)-1)):\n",
" \n",
" while(string[index]!=' ' ):\n",
" if(index+1==len(string)):\n",
" break\n",
" index=index+1\n",
" return string[:index] + ' '+char + ' ' + string[index:]\n",
" else:\n",
" return string + ' '+char + ' ' \n",
" \n",
"\n",
"\n",
"def extract_emojis(str1):\n",
" try:\n",
" return [(c,i) for i,c in enumerate(str1) if c in emoji.UNICODE_EMOJI]\n",
" except AttributeError:\n",
" return []"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use multiprocessing framework for speeding up translation process\n",
"def parallelize_dataframe(df, func, n_cores=4):\n",
" '''parallelize the dataframe'''\n",
" df_split = np.array_split(df, n_cores)\n",
" pool = Pool(n_cores)\n",
" df = pd.concat(pool.map(func, df_split))\n",
" pool.close()\n",
" pool.join()\n",
" return df\n",
"\n",
"# Main function for translation\n",
"def translate(x,lang):\n",
" '''provide the translation given text and the language'''\n",
" #x=preprocess_lib.preprocess_multi(x,lang,multiple_sentences=False,stop_word_remove=False, tokenize_word=False, tokenize_sentence=False)\n",
" emoji_list=extract_emojis(x)\n",
" try:\n",
" translated_text=translate_text(x,lang,'en')\n",
" except:\n",
" translated_text=x\n",
" for ele in emoji_list:\n",
" translated_text=approximate_emoji_insert(translated_text, ele[1],ele[0])\n",
" return translated_text\n",
"\n",
"def add_features(df):\n",
" '''adding new features to the dataframe'''\n",
" translated_text=[]\n",
" for index,row in df.iterrows():\n",
" if(row['lang']in ['en','unk']):\n",
" translated_text.append(row['text'])\n",
" else:\n",
" translated_text.append(translate(row['text'],row['lang'])) \n",
" df[\"translated\"]=translated_text\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import glob \n",
"train_files = glob.glob('train/*.csv')\n",
"test_files = glob.glob('test/*.csv')\n",
"val_files = glob.glob('val/*.csv')\n",
"files= train_files+test_files+val_files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from tqdm import tqdm_notebook\n",
"size=10\n",
"\n",
"for file in files:\n",
" wp_data=pd.read_csv(file)\n",
" list_df=[]\n",
" for i in tqdm_notebook(range(0,100,size)):\n",
" print(i,\"_iteration\")\n",
" df_new=parallelize_dataframe(wp_data[i:i+size],add_features,n_cores=20)\n",
" list_df.append(df_new)\n",
" df_translated=pd.concat(list_df,ignore_index=True)\n",
" file_name='translated'+file\n",
" df_translated.to_csv(file_name)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
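
To make the emoji round-trip concrete, here is a small standalone check of the `approximate_emoji_insert` helper defined in the notebook above; the sample sentence and emoji are illustrative:

```python
# Standalone check of the emoji re-insertion helper from Translation.ipynb.
def approximate_emoji_insert(string, index, char):
    # walk right from `index` to the next space (or the end) and splice `char` in
    if index < (len(string) - 1):
        while string[index] != ' ':
            if index + 1 == len(string):
                break
            index = index + 1
        return string[:index] + ' ' + char + ' ' + string[index:]
    else:
        return string + ' ' + char + ' '

# An emoji that sat at index 7 of the source text reappears after the word
# covering that position in the translated text:
print(approximate_emoji_insert("good morning everyone", 7, "😊"))
# -> 'good morning 😊  everyone'
```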

File diff suppressed because it is too large