Move translation file to dataset

This commit is contained in:
Saketh
2020-04-14 16:23:18 +05:30
parent 0def1cc6e2
commit c12e146c83
3 changed files with 166 additions and 2030 deletions


@@ -35,4 +35,5 @@ In the above datasets, some of them contain multiple labels for the text such as
## Instructions for getting the datasets
1. Download the datasets from the above sources and place them in the subfolder `Dataset/full_data`.
2. Use `Translation.ipynb` to translate the datasets into English, as sketched below.
3. Use the ids given in the `ID Mapping` folder to split the datasets into train, val and test splits, using the file `Stratified Split.ipynb`.
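
For orientation, a minimal sketch of the translation step, assuming the `gtrans` package behaves as `Translation.ipynb` (included below) uses it; the sample text and language code are illustrative:

```python
# Minimal sketch of step 2, using the same translate_text(text, src, 'en')
# call that Translation.ipynb relies on. Sample text/language are illustrative.
from gtrans import translate_text

sample = "क्या हाल है"  # illustrative Hindi text
print(translate_text(sample, 'hi', 'en'))  # expect an English rendering, e.g. "How are you"
```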

Dataset/Translation.ipynb Normal file

@@ -0,0 +1,164 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Necessary imports\n",
"import re\n",
"import emoji\n",
"from gtrans import translate_text, translate_html\n",
"import random\n",
"import pandas as pd\n",
"import numpy as np\n",
"from multiprocessing import Pool\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Function to remove emojis in text, since these conflict during translation\n",
"def remove_emoji(text):\n",
" return emoji.get_emoji_regexp().sub(u'', text)\n",
"\n",
"\n",
"def approximate_emoji_insert(string, index,char):\n",
" if(index<(len(string)-1)):\n",
" \n",
" while(string[index]!=' ' ):\n",
" if(index+1==len(string)):\n",
" break\n",
" index=index+1\n",
" return string[:index] + ' '+char + ' ' + string[index:]\n",
" else:\n",
" return string + ' '+char + ' ' \n",
" \n",
"\n",
"\n",
"def extract_emojis(str1):\n",
" try:\n",
" return [(c,i) for i,c in enumerate(str1) if c in emoji.UNICODE_EMOJI]\n",
" except AttributeError:\n",
" return []"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use multiprocessing framework for speeding up translation process\n",
"def parallelize_dataframe(df, func, n_cores=4):\n",
" '''parallelize the dataframe'''\n",
" df_split = np.array_split(df, n_cores)\n",
" pool = Pool(n_cores)\n",
" df = pd.concat(pool.map(func, df_split))\n",
" pool.close()\n",
" pool.join()\n",
" return df\n",
"\n",
"# Main function for translation\n",
"def translate(x,lang):\n",
" '''provide the translation given text and the language'''\n",
" #x=preprocess_lib.preprocess_multi(x,lang,multiple_sentences=False,stop_word_remove=False, tokenize_word=False, tokenize_sentence=False)\n",
" emoji_list=extract_emojis(x)\n",
" try:\n",
" translated_text=translate_text(x,lang,'en')\n",
" except:\n",
" translated_text=x\n",
" for ele in emoji_list:\n",
" translated_text=approximate_emoji_insert(translated_text, ele[1],ele[0])\n",
" return translated_text\n",
"\n",
"def add_features(df):\n",
" '''adding new features to the dataframe'''\n",
" translated_text=[]\n",
" for index,row in df.iterrows():\n",
" if(row['lang']in ['en','unk']):\n",
" translated_text.append(row['text'])\n",
" else:\n",
" translated_text.append(translate(row['text'],row['lang'])) \n",
" df[\"translated\"]=translated_text\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import glob \n",
"train_files = glob.glob('train/*.csv')\n",
"test_files = glob.glob('test/*.csv')\n",
"val_files = glob.glob('val/*.csv')\n",
"files= train_files+test_files+val_files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from tqdm import tqdm_notebook\n",
"size=10\n",
"\n",
"for file in files:\n",
" wp_data=pd.read_csv(file)\n",
" list_df=[]\n",
" for i in tqdm_notebook(range(0,100,size)):\n",
" print(i,\"_iteration\")\n",
" df_new=parallelize_dataframe(wp_data[i:i+size],add_features,n_cores=20)\n",
" list_df.append(df_new)\n",
" df_translated=pd.concat(list_df,ignore_index=True)\n",
" file_name='translated'+file\n",
" df_translated.to_csv(file_name)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
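
To make the emoji round-trip concrete, here is a small standalone check of the `approximate_emoji_insert` helper defined in the notebook above; the sample sentence and emoji are illustrative:

```python
# Standalone check of the emoji re-insertion helper from Translation.ipynb.
def approximate_emoji_insert(string, index, char):
    # walk right from `index` to the next space (or the end) and splice `char` in
    if index < (len(string) - 1):
        while string[index] != ' ':
            if index + 1 == len(string):
                break
            index = index + 1
        return string[:index] + ' ' + char + ' ' + string[index:]
    else:
        return string + ' ' + char + ' '

# An emoji that sat at index 7 of the source text reappears after the word
# covering that position in the translated text:
print(approximate_emoji_insert("good morning everyone", 7, "😊"))
# -> 'good morning 😊  everyone'
```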

File diff suppressed because it is too large