{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import pandas as pd\n",
"\n",
"from feature_extraction import get_features\n",
"from feature_extraction import async_call_get_features"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Read Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_excel(\"data.xlsx\")\n",
|
|
"\n",
|
|
"# Make sure there are no Null values and the data type is <str>\n",
|
|
"df.dropna(subset=['text'])\n",
|
|
"df['text'] = df['text'].astype('str')\n",
|
|
"\n",
|
|
"# Strings from which features are to be extracted\n",
|
|
"docs = df['text'].tolist()\n",
|
|
"len(docs)"
|
|
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color='red'>Note: Press <Shift+Tab> to access the docstring of <B><I>various functions</B></I>, which contains various default parameters for stages of processing</font>"
|
|
]
|
|
},
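{
"cell_type": "markdown",
"metadata": {},
"source": [
"Alternatively, the same docstring (including the default parameters) can be printed with Python's built-in <code>help</code>:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Print the docstring of get_features, which lists the default `stages` parameters\n",
"help(get_features)"
]
},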
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Define Stages"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"stages = {'nouns': True,\n",
|
|
" 'verbs': True,\n",
|
|
" 'adjs': True,\n",
|
|
" 'noun_phrases': False,\n",
|
|
" 'keywords': False,\n",
|
|
" 'ner': False,\n",
|
|
" 'numbers': False,}"
|
|
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Sequential Processing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"start_time = time.time()\n",
|
|
"\n",
|
|
"# Processes data sequential without creating processes (Params - (Strings_to_be_processed, Dict_of_stages))\n",
|
|
"nouns_list, verbs_list, adjs_list, ners_list, noun_chunks, yake_keywords, numbers_list = \\\n",
|
|
" get_features(docs, stages)\n",
|
|
"\n",
|
|
"print(\"Time Elapsed:\", time.time()-start_time)"
|
|
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Parallel Processing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Input Order is mainted, output is according to the input order.\n",
|
|
"start_time = time.time()\n",
|
|
"\n",
|
|
"# Processes data simultaneously by creating processes (Params - (Strings_to_be_processed, Dict_of_stages, Number_of_processes))\n",
|
|
"nouns_list2, verbs_list2, adjs_list2, ners_list2, noun_chunks2, yake_keywords2, numbers_list2 = \\\n",
|
|
" async_call_get_features(docs, stages, n_processes=2)\n",
|
|
"\n",
|
|
"print(\"Time Elapsed:\", time.time()-start_time)"
|
|
]
},
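{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check (a minimal sketch, not part of the original flow): since <code>async_call_get_features</code> maintains the input order, its outputs for the enabled stages should match the sequential results exactly."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Parallel output should equal sequential output element-wise for the enabled stages\n",
"assert nouns_list == nouns_list2\n",
"assert verbs_list == verbs_list2\n",
"assert adjs_list == adjs_list2"
]
},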
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Segregate the NERS into ORG, PER, LOC"
|
|
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"per_list = list()\n",
|
|
"loc_list = list()\n",
|
|
"org_list = list()\n",
|
|
"for each_ner_set in ners_list:\n",
|
|
" per_list.append(each_ner_set.get('PER', ''))\n",
|
|
" loc_list.append(each_ner_set.get('LOC', ''))\n",
|
|
" org_list.append(each_ner_set.get('ORG', ''))"
|
|
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Write to Disk"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Remove the ones that are not extracted from the below code\n",
|
|
"df_features = pd.DataFrame({'id':df['id'].tolist(),\n",
|
|
" 'text': docs,\n",
|
|
" 'nouns': nouns_list,\n",
|
|
" 'verbs': verbs_list,\n",
|
|
" 'adjs':adjs_list,\n",
|
|
" 'noun_phrases':noun_chunks,\n",
|
|
" 'keywords':yake_keywords,\n",
|
|
" 'numbers': numbers_list,\n",
|
|
" 'Person': per_list,\n",
|
|
" 'Organization': org_list,\n",
|
|
" 'Location': loc_list,\n",
|
|
" })\n",
|
|
"\n",
|
|
"df_features.to_excel('trail.xlsx', index=False)\n",
|
|
"df_features.head()"
|
|
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<hr>\n",
|
|
"<B><I>IGNORE - Trials for Manager instead of Pipe</I></B><br>\n",
|
|
"<font color='purple'>Turns out processes in Manager donot return values in an order. Hence order is not maintained</font>"
|
|
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def async_call_get_features_manager(strings, stages={}, n_processes=3):\n",
|
|
" \"\"\"\n",
|
|
" Function to create async processes for faster processing. Automatically creates processe and assigns data to each process call.\n",
|
|
" This function uses Manager instead of Pipe, so order is not mainted in this function. IGNORE this function.\n",
|
|
" \n",
|
|
" :param strings: A list of strings to be processed or extracted features from\n",
|
|
" :param stages: Dictionary that contains stages to be executed\n",
|
|
" :param n_processes: Integer value of number of processess to be created\n",
|
|
" \n",
|
|
" <Returns a list of extracted features, 7 list items> \\n\n",
|
|
" \n",
|
|
" (default_stages = {\n",
|
|
" 'nouns': True,\n",
|
|
" 'verbs': True,\n",
|
|
" 'adjs': False,\n",
|
|
" 'noun_phrases': False,\n",
|
|
" 'keywords': False,\n",
|
|
" 'ner': False,\n",
|
|
" 'numbers': False,\n",
|
|
" })\n",
|
|
" \"\"\"\n",
|
|
" # Calculate the indices of strings to be passed to multiple processes\n",
|
|
" ranges = calculate_ranges(len(strings), n_processes)\n",
|
|
"\n",
|
|
" # Create a Job Manager to share a dictionary that could store results of multiple processes \n",
|
|
" jobs = [] \n",
|
|
" manager = multiprocessing.Manager()\n",
|
|
" return_dict = manager.dict()\n",
|
|
"\n",
|
|
" # Start creating processes and pass the records/strings according to the indices generated\n",
|
|
" for i in range(len(ranges)-1):\n",
|
|
" string_set = strings[ranges[i] : ranges[i+1]]\n",
|
|
" p = multiprocessing.Process(target=get_features, args=(string_set, stages, i, return_dict))\n",
|
|
" jobs.append(p)\n",
|
|
" p.start()\n",
|
|
"\n",
|
|
" # Wait for the result of each process\n",
|
|
" for proc in jobs:\n",
|
|
" proc.join()\n",
|
|
" \n",
|
|
" all_list = [[], [], [], [], [], [], []]\n",
|
|
" for k in return_dict.keys():\n",
|
|
" for i, j in enumerate(return_dict[k]):\n",
|
|
" all_list[i] += j\n",
|
|
" \n",
|
|
" return all_list"
|
|
]
}
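,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For contrast, below is a minimal sketch of the Pipe-based idea (NOT the repo's actual implementation): give each process its own Pipe and read the parent connections back in creation order, which preserves the input order. It assumes <code>get_features</code> and <code>calculate_ranges</code> behave as used elsewhere in this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import multiprocessing\n",
"\n",
"def _pipe_worker(conn, string_set, stages):\n",
"    # Hypothetical worker: relies on get_features(strings, stages) returning the 7 feature lists\n",
"    conn.send(get_features(string_set, stages))\n",
"    conn.close()\n",
"\n",
"def async_call_get_features_pipe_sketch(strings, stages={}, n_processes=3):\n",
"    # Same range splitting as the Manager trial above\n",
"    ranges = calculate_ranges(len(strings), n_processes)\n",
"    jobs, parent_conns = [], []\n",
"    for i in range(len(ranges)-1):\n",
"        parent_conn, child_conn = multiprocessing.Pipe()\n",
"        p = multiprocessing.Process(target=_pipe_worker,\n",
"                                    args=(child_conn, strings[ranges[i]:ranges[i+1]], stages))\n",
"        jobs.append(p)\n",
"        parent_conns.append(parent_conn)\n",
"        p.start()\n",
"\n",
"    # Reading the parent connections in creation order preserves the input order\n",
"    all_list = [[], [], [], [], [], [], []]\n",
"    for conn in parent_conns:\n",
"        for i, j in enumerate(conn.recv()):\n",
"            all_list[i] += j\n",
"\n",
"    for p in jobs:\n",
"        p.join()\n",
"    return all_list"
]
}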
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}