mirror of https://github.com/nikhiljsk/preprocess_nlp.git (synced 2021-10-18 10:21:04 +03:00)
remove_files
@@ -1,237 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import spacy\n",
"import nltk\n",
"import yake\n",
"\n",
"\n",
"def remove_duplicates(ele_list):\n",
"    new_list = []\n",
"    for ele in ele_list:\n",
"        if ele not in new_list:\n",
"            new_list.append(ele)\n",
"    return new_list\n",
"\n",
"\n",
"def append_ngrams(noun_list, n_size=2):\n",
"    new_list = list()\n",
"    for sentence in noun_list:\n",
"        new_list.append(','.join([x+'_'+y for x,y in list(nltk.ngrams(sentence.split(','), n_size))]))\n",
"    new_list = [noun_list[i] + ',' + new_list[i] for i in range(len(noun_list))]\n",
"    return new_list\n",
"\n",
"\n",
"def parse_doc(nlp, text):\n",
"    return nlp(text)\n",
"\n",
"\n",
"def get_noun(doc):\n",
"    noun_list = []\n",
"    for word in doc:\n",
"        if word.pos_ in ['PROPN', 'NOUN']:\n",
"            noun_list.append(word.text)\n",
"    noun_list = remove_duplicates(noun_list)\n",
"    return \",\".join(noun_list)\n",
"\n",
"\n",
"def get_verb(doc):\n",
"    verb_list = []\n",
"    for word in doc:\n",
"        if word.pos_ in ['VERB']:\n",
"            verb_list.append(word.text)\n",
"    verb_list = remove_duplicates(verb_list)\n",
"    return \",\".join(verb_list)\n",
"\n",
"\n",
"def get_ner(doc):\n",
"    ner_list = []\n",
"    for ent in doc.ents:\n",
"        ner_list.append(ent.text)\n",
"    ner_list = remove_duplicates(ner_list)\n",
"    return \",\".join(ner_list)\n",
"\n",
"\n",
"def get_keywords(docs):\n",
"    language = \"en\"\n",
"    max_ngram_size = 3\n",
"    deduplication_threshold = 0.9\n",
"    deduplication_algo = 'seqm'\n",
"    numOfKeywords = 1000\n",
"    \n",
"    list_of_keys = list()\n",
"    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, top=numOfKeywords, features=None)\n",
"    \n",
"    for loc, each_article in enumerate(docs):\n",
"        keywords = custom_kw_extractor.extract_keywords(each_article)\n",
"        temp1 = list()\n",
"        for i, j in keywords:\n",
"            temp1.append(j)\n",
"        list_of_keys.append(\", \".join(temp1))\n",
"    return list_of_keys\n",
"\n",
"\n",
"def get_features(docs, stages):\n",
"    default_stages = {\n",
"        'nouns': True,\n",
"        'verbs': True,\n",
"        'noun_phrases': False,\n",
"        'keywords': False,\n",
"        'ner': True,\n",
"    }\n",
"    # dict.update() mutates in place and returns None, so don't reassign\n",
"    default_stages.update(stages)\n",
"    \n",
"    t = time.time()\n",
"    noun_chunks = list()\n",
"    # verbs_list = list()\n",
"    nlp = spacy.load('en_core_web_sm')\n",
"    # ners_list = list()\n",
"    # nouns_list = list()\n",
"\n",
"    # for text in docs:\n",
"    #     doc = parse_doc(nlp, text)\n",
"    #     verbs_list.append(get_verb(doc))\n",
"    #     ners_list.append(get_ner(doc))\n",
"    #     nouns_list.append(get_noun(doc))\n",
"    #     noun_chunks.append(','.join(remove_duplicates([str(x) for x in list(doc.noun_chunks)])))\n",
"    \n",
"    for doc in nlp.pipe(docs, disable=['ner']):\n",
"        # nouns_list.append(get_noun(doc))\n",
"        # verbs_list.append(get_verb(doc))\n",
"        noun_chunks.append(','.join(remove_duplicates([str(x) for x in list(doc.noun_chunks)])))\n",
"\n",
"    # keywords = [\",\".join(filter(None, [x, y])) for x, y in zip(noun_chunks, verbs_list)]\n",
"    # keywords = append_ngrams(nouns_list, 2)\n",
"    # keywords = noun_chunks\n",
"    yake_keywords = get_keywords(docs)\n",
"    keywords = [\",\".join(filter(None, [x, y])) for x, y in zip(noun_chunks, yake_keywords)]\n",
"    keywords = [','.join(set(x.split(','))) for x in keywords]\n",
"    print(\"Time elapsed for Keyword Extraction:\", time.time() - t)\n",
"    return keywords"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time elapsed for Keyword Extraction: 0.4613072872161865\n"
]
},
{
"data": {
"text/plain": [
"['Time,this space,Keywords Extraction, elapsed for keywords, time elapsed, keywords, time, elapsed, extraction, space,keywords extraction, defined',\n",
" 'you,what,me']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs = ['Time elapsed for Keywords Extraction would be defined here in this space', 'Also, let me know what you think of this']\n",
"get_features(docs, {})"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time time NOUN NN nsubjpass Xxxx True False\n",
"elapsed elapse VERB VBN acl xxxx True False\n",
"for for ADP IN prep xxx True True\n",
"Keywords Keywords PROPN NNP compound Xxxxx True False\n",
"Extraction Extraction PROPN NNP pobj Xxxxx True False\n",
"would would VERB MD aux xxxx True True\n",
"be be AUX VB auxpass xx True True\n",
"defined define VERB VBN ROOT xxxx True False\n",
"here here ADV RB advmod xxxx True True\n",
"in in ADP IN prep xx True True\n",
"this this DET DT det xxxx True True\n",
"space space NOUN NN pobj xxxx True False\n",
"Also also ADV RB advmod Xxxx True True\n",
", , PUNCT , punct , False False\n",
"let let VERB VB ROOT xxx True False\n",
"me -PRON- PRON PRP nsubj xx True True\n",
"know know VERB VB ccomp xxxx True False\n",
"what what PRON WP dobj xxxx True True\n",
"you -PRON- PRON PRP nsubj xxx True True\n",
"think think VERB VBP ccomp xxxx True False\n",
"of of ADP IN prep xx True True\n",
"this this DET DT pobj xxxx True True\n"
]
}
],
"source": [
"nlp = spacy.load('en_core_web_sm')\n",
"for doc in docs:\n",
"    da = nlp(doc)\n",
"    for token in da:\n",
"        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,\n",
"              token.shape_, token.is_alpha, token.is_stop)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Updates"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"* Feature extraction: Nouns, Verbs, Adjectives, Numbers, Noun Phrases, NERs, Keywords\n",
"* Vectorization tools: TF-IDF, GloVe, Word2Vec, Bag of Words"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
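The append_ngrams helper above is only referenced from commented-out code, so its behaviour is easy to miss: it takes comma-joined noun strings and appends underscore-joined n-grams of consecutive entries. A minimal usage sketch, assuming the cell defining it has been run (the input string here is made up for illustration):

    nouns = ["time,keywords,extraction,space"]
    print(append_ngrams(nouns, 2))
    # ['time,keywords,extraction,space,time_keywords,keywords_extraction,extraction_space']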
@@ -1,239 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import time\n",
"import pandas as pd\n",
"\n",
"from preprocess_nlp import preprocess_nlp\n",
"from preprocess_nlp import async_call_preprocess"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Read Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df = pd.read_excel('data.xlsx', nrows=30000)\n",
"\n",
"# Make sure there are no Null values and the data type is <str>\n",
"df.dropna(inplace=True)\n",
"df['body'] = df['body'].astype('str')\n",
"\n",
"print(\"Total strings\", len(df['body'].tolist()))\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Define Stages"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Default Stages\n",
"stages = {'remove_tags_nonascii': True,\n",
"          'lower_case': True,\n",
"          'expand_contractions': False,\n",
"          'remove_escape_chars': True,\n",
"          'remove_punctuation': True,\n",
"          'remove_stopwords': False,\n",
"          'remove_numbers': True,\n",
"          'lemmatize': False,\n",
"          'stemming': False,\n",
"          'min_word_len': 2}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Sequential Processing"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color='red'>Note: Press <Shift+Tab> to access the docstring of <B><I>preprocess_nlp</I></B>, which lists the default parameters for the various stages of processing</font>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"start_time = time.time()\n",
"\n",
"# Processes the data sequentially, without creating extra processes (Params - (Strings_to_be_processed, Dict_of_stages))\n",
"processed_text_seq = preprocess_nlp(df['body'].tolist(), stages)\n",
"\n",
"print(\"Time Elapsed:\", time.time()-start_time)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Parallel Processing"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color='red'>Note: Press <Shift+Tab> to access the docstring of <B><I>async_call_preprocess</I></B>, which lists the default parameters for the various stages of processing</font>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Order is not maintained\n",
"start_time = time.time()\n",
"\n",
"# Processes the data in parallel by creating multiple processes (Params - (Strings_to_be_processed, Dict_of_stages, Number_of_processes))\n",
"processed_text_par = async_call_preprocess(df['body'].tolist(), stages, 2)\n",
"\n",
"print(\"Time Elapsed:\", time.time()-start_time)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Write to Disk"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Use the sequential output here; the parallel output does not preserve the original order\n",
"df_new = pd.DataFrame({'id': df['id'].tolist(), 'processed_text': processed_text_seq})\n",
"df_new.to_excel('processed.xlsx', index=False)\n",
"df_new.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<hr>\n",
"<B><I>IGNORE - Trials for Multi-Threading</I></B><br>\n",
"<font color='purple'>It turns out that separate processes run truly in parallel and are faster here; threads are better suited to I/O-bound work</font>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from multiprocessing.pool import ThreadPool\n",
"\n",
"def async_call_preprocess(strings, stages, n_processes=3):\n",
"    \"\"\"\n",
"    Function to create async threads for faster processing. Automatically creates threads and assigns data to each thread call.\n",
"    \n",
"    :param strings: A list of strings to be preprocessed\n",
"    :param stages: A dictionary with keys as stages and values as Boolean/Integer. Can be used to customize the stages in preprocessing\n",
"    :param n_processes: Integer value for the number of threads to be created\n",
"    (Default parameters for stages)\n",
"    {'remove_tags_nonascii': True, 'lower_case': True, 'expand_contractions': False, 'remove_punctuation': True, 'remove_escape_chars': True, 'remove_stopwords': False, 'remove_numbers': True, 'lemmatize': False, 'stemming': False, 'min_word_len': 2}\n",
"    \n",
"    <Returns a list of preprocessed strings, aggregated from threads>\n",
"    \"\"\"\n",
"    pool = ThreadPool(processes=n_processes)\n",
"    \n",
"    # Note the start time\n",
"    start_time = time.time()\n",
"    \n",
"    # Calculate the indices of strings to be passed to the threads\n",
"    # (calculate_ranges is a helper that is not defined in this cell and must be in scope)\n",
"    ranges = calculate_ranges(len(strings), n_processes)\n",
"    print(ranges)\n",
"    \n",
"    # Create threads and then pass data\n",
"    process_dict = dict()\n",
"    for i in range(len(ranges)-1):\n",
"        string_set = strings[ranges[i] : ranges[i+1]]\n",
"        process_dict[i] = pool.apply_async(preprocess_nlp, (string_set, stages))  # tuple of args for preprocess_nlp\n",
"    \n",
"    # Join the results\n",
"    processed_strings = list()\n",
"    for i in range(len(ranges)-1):\n",
"        processed_strings.append(process_dict[i].get())\n",
"    \n",
"    for i in range(len(ranges)-1):\n",
"        print(len(process_dict[i].get()))\n",
"    \n",
"    print(\"Time Elapsed:\", time.time()-start_time)\n",
"    return processed_strings"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
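The ThreadPool trial cell above calls calculate_ranges, which is not defined in the deleted notebook and is presumably provided by the preprocess_nlp module. A minimal sketch of such a helper, under the assumption that it returns index boundaries splitting the input into roughly equal slices (the real helper's name and exact behaviour may differ):

    def calculate_ranges(n_strings, n_processes):
        # Boundaries like [0, k, 2k, ..., n_strings] so that
        # strings[ranges[i]:ranges[i+1]] gives one chunk per thread
        step = -(-n_strings // n_processes)  # ceiling division
        return list(range(0, n_strings, step)) + [n_strings]

    calculate_ranges(10, 3)  # [0, 4, 8, 10]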