mirror of
https://github.com/nikhiljsk/preprocess_nlp.git
synced 2021-10-18 10:21:04 +03:00
173 lines
3.8 KiB
Plaintext
173 lines
3.8 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Imports"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import time\n",
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"import vocab_elimination_nlp as ve"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Read Data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df = pd.read_excel('data.xlsx')\n",
|
|
"\n",
|
|
"# Make sure there are no Null values and the data type is <str>\n",
|
|
"# The input here is ideally the output from preprocess_nlp (where the lines are separated by ' . '; otherwise, change the code in vocab_elimination_nlp.py)\n",
|
|
"df.dropna(subset=['text'], inplace=True)\n",
|
|
"df['text'] = df['text'].astype('str')\n",
|
|
"\n",
|
|
"print(\"Total strings\", len(df['text'].tolist()))\n",
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"<font color='red'>Note: Press &lt;Shift+Tab&gt; to access the docstring of <B><I>various functions</I></B>, which contains various default parameters for stages of processing</font>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### For getting the frequency and % of words"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# The plot -- X-axis (Count of words) Y-axis (Percentage till that word)\n",
|
|
"# Example -- If you take 80% (0.8), you are taking the top 2000 words (80% of the words can be defined using only 2000 words)\n",
|
|
"\n",
|
|
"all_words, freqs, percentage_freqs = ve.freq_of_words(df['text'].tolist())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### For Shortlisting top % of words"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"short_words, short_freqs = ve.shortlist_words(all_words, freqs, percentage_freqs, threshold_freq=0.99)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Sequential Processing"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"st = time.time()\n",
|
|
"\n",
|
|
"replaced_sent = ve.vocab_elimination(df['text'].tolist(), short_words, replace_with='<unk>')\n",
|
|
"\n",
|
|
"print(time.time()-st)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Parallel Processing"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Order is not maintained\n",
|
|
"# Faster than Sequential if the number of records is greater than a million\n",
|
|
"st = time.time()\n",
|
|
"\n",
|
|
"replaced_sent = ve.async_call_vocab_elimination(df['text'].tolist(), short_words, replace_with='<unk>', n_processes=5)\n",
|
|
"\n",
|
|
"print(time.time()-st)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Write to Disk"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df_new = pd.DataFrame({'replaced_text': replaced_sent})\n",
|
|
"df_new.to_excel('trial.xlsx', index=False)\n",
|
|
"df_new.head()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.10"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|