preprocess_nlp-multiprocess/vocab_elimination/Vocab_Elimination_Example_Notebook.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import pandas as pd\n",
    "\n",
    "import vocab_elimination_nlp as ve"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Read Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_excel('data.xlsx')\n",
    "\n",
    "# Make sure there are no Null values and the data type is <str>\n",
    "# The input here is ideally the output from preprocess_nlp (Where the lines are seperated by ' . ', Else change the code in vocab_elimination_nlp.py)\n",
    "df.dropna(subset=['text'], inplace=True)\n",
    "df['text'] = df['text'].astype('str')\n",
    "\n",
    "print(\"Total strings\", len(df['text'].tolist()))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<font color='red'>Note: Press <Shift+Tab> to access the docstring of <B><I>various functions</B></I>, which contains various default parameters for stages of processing</font>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### For getting the frequency and % of words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The plot -- X-axis (Count of words) Y-axis (Percentage till that word)\n",
    "# Example -- You take 80%(0.8), you are taking top 2000 words (80% of the words can be defined using only 2000 words)\n",
    "\n",
    "all_words, freqs, percentage_freqs = ve.freq_of_words(df['text'].tolist())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### For Shortlisting top % of words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "short_words, short_freqs = ve.shortlist_words(all_words, freqs, percentage_freqs, threshold_freq=0.99)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Sequential Processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "st = time.time()\n",
    "\n",
    "replaced_sent = ve.vocab_elimination(df['text'].tolist(), short_words, replace_with='<unk>')\n",
    "\n",
    "print(time.time()-st)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Parallel Processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Order is not maintained\n",
    "# Faster than Sequential if the number of records are greater than a million\n",
    "st = time.time()\n",
    "\n",
    "replaced_sent = ve.async_call_vocab_elimination(df['text'].tolist(), short_words, replace_with='<unk>', n_processes=5)\n",
    "\n",
    "print(time.time()-st)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Write to Disk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df_new = pd.DataFrame({'replaced_text': replaced_sent})\n",
    "df_new.to_excel('trial.xlsx', index=False)\n",
    "df_new.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}