mirror of
https://github.com/nikhiljsk/preprocess_nlp.git
synced 2021-10-18 10:21:04 +03:00
240 lines
6.6 KiB
Plaintext
240 lines
6.6 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Imports"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import time\n",
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"from preprocess_nlp import preprocess_nlp\n",
|
|
"from preprocess_nlp import async_call_preprocess"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Read Data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df = pd.read_excel('data.xlsx', nrows=30000)\n",
|
|
"\n",
|
|
"# Make sure there are no Null values and the data type is <str>\n",
|
|
"df.dropna(inplace=True)\n",
|
|
"df['body'] = df['body'].astype('str')\n",
|
|
"\n",
|
|
"print(\"Total strings\", len(df['body'].tolist()))\n",
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Define Stages"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Default Stages\n",
|
|
"stages = {'remove_tags_nonascii': True, \n",
|
|
" 'lower_case': True,\n",
|
|
" 'expand_contractions': False,\n",
|
|
" 'remove_escape_chars': True,\n",
|
|
" 'remove_punctuation': True,\n",
|
|
" 'remove_stopwords': False,\n",
|
|
" 'remove_numbers': True,\n",
|
|
" 'lemmatize': False,\n",
|
|
" 'stemming': False,\n",
|
|
" 'min_word_len': 2}"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Sequential Processing"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"<font color='red'>Note: Press <Shift+Tab> to access the docstring of <B><I>preprocess_nlp</B></I>, which contains various default parameters for stages of processing</font>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"start_time = time.time()\n",
|
|
"\n",
|
|
"# Processes data sequential without creating processes (Params - (Strings_to_be_processed, Dict_of_stages))\n",
|
|
"processed_text_seq = preprocess_nlp(df['body'].tolist(), stages)\n",
|
|
"\n",
|
|
"print(\"Time Elapsed:\", time.time()-start_time)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Parallel Processing"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"<font color='red'>Note: Press <Shift+Tab> to access the docstring of <B><I>async_call_preprocess</B></I>, which contains various default parameters for stages of processing</font>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Order is not maintained\n",
|
|
"start_time = time.time()\n",
|
|
"\n",
|
|
"# Processes data simultaneously by creating processes (Params - (Strings_to_be_processed, Dict_of_stages, Number_of_processes))\n",
|
|
"processed_text_par = async_call_preprocess(df['body'].tolist(), stages, 2)\n",
|
|
"\n",
|
|
"print(\"Time Elapsed:\", time.time()-start_time)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Write to Disk"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"df_new = pd.DataFrame({'id': df['id'].tolist(), 'processed_text': processed_text})\n",
|
|
"df_new.to_excel('processed.xlsx', index=False)\n",
|
|
"df_new.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"<hr>\n",
|
|
"<B><I>IGNORE - Trials for Multi-Thread</I></B><br>\n",
|
|
"<font color='purple'>Turns out processes are faster and run simultaneously allowing parallel processing. Threads are better for I/O sequences</font>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from multiprocessing.pool import ThreadPool\n",
|
|
"\n",
|
|
"def async_call_preprocess(strings, stages, n_processes=3):\n",
|
|
" \"\"\"\n",
|
|
" Function to create async threads for faster processing. Automatically creates threads and assigns data to each thread call\n",
|
|
" \n",
|
|
" :param strings: A list of strings to be preprocessed\n",
|
|
" :param stages: A dictionary with keys as stages and values as Boolean/Integer. Can be used to customize the stages in preprocessing\n",
|
|
" :param n_processes: Integer value of number of threads to be created\n",
|
|
" (Default parameters for stages)\n",
|
|
" {'remove_tags_nonascii': True, 'lower_case': True,'expand_contractions': False, 'remove_punctuation': True, 'remove_escape_chars': True, 'remove_stopwords': False, 'remove_numbers': True, 'lemmatize': False, 'stemming': False, 'min_word_len': 2\n",
|
|
" \n",
|
|
" <Returns a list of preprocessed strings, aggregated from threads>\n",
|
|
" \"\"\"\n",
|
|
" pool = ThreadPool(processes=n_processes)\n",
|
|
" \n",
|
|
" # Note the start time\n",
|
|
" start_time = time.time()\n",
|
|
" \n",
|
|
" # Calculate the indices of strings to be passed to multiple processes\n",
|
|
" ranges = calculate_ranges(len(strings), n_processes)\n",
|
|
" print(ranges)\n",
|
|
" \n",
|
|
" # Create processes and then pass data\n",
|
|
" process_dict = dict()\n",
|
|
" for i in range(len(ranges)-1):\n",
|
|
" string_set = strings[ranges[i] : ranges[i+1]]\n",
|
|
" process_dict[i] = pool.apply_async(preprocess_nlp, (string_set, stages)) # tuple of args for foo\n",
|
|
" \n",
|
|
" # Join the results\n",
|
|
" processed_strings = list()\n",
|
|
" for i in range(len(ranges)-1):\n",
|
|
" processed_strings.append(process_dict[i].get())\n",
|
|
" \n",
|
|
" for i in range(len(ranges)-1):\n",
|
|
" print(len(process_dict[i].get()))\n",
|
|
" \n",
|
|
" print(\"Time Elapsed:\", time.time()-start_time)\n",
|
|
" return processed_strings"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.10"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|