mirror of https://github.com/nikhiljsk/preprocess_nlp.git synced 2021-10-18 10:21:04 +03:00

remove_files

This commit is contained in:
nikhiljsk
2020-02-13 15:15:07 +05:30
parent 4524da571f
commit 3f0c6793c0
2 changed files with 0 additions and 476 deletions


@@ -1,237 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import spacy\n",
"import nltk\n",
"import yake\n",
"\n",
"\n",
"def remove_duplicates(ele_list):\n",
" new_list = []\n",
" for ele in ele_list:\n",
" if ele not in new_list:\n",
" new_list.append(ele)\n",
" return new_list\n",
"\n",
"\n",
"def append_ngrams(noun_list, n_size=2):\n",
" new_list = list()\n",
" for sentence in noun_list:\n",
" new_list.append(','.join([x+'_'+y for x,y in list(nltk.ngrams(sentence.split(','), n_size))]))\n",
" new_list = [noun_list[i] + ',' + new_list[i] for i in range(len(noun_list))]\n",
" return new_list\n",
"\n",
"\n",
"def parse_doc(nlp, text):\n",
" return nlp(text)\n",
"\n",
"\n",
"def get_noun(doc):\n",
" noun_list = []\n",
" for word in doc:\n",
" if word.pos_ in ['PROPN', 'NOUN']:\n",
" noun_list.append(word.text)\n",
" noun_list = remove_duplicates(noun_list)\n",
" return \",\".join(noun_list)\n",
"\n",
"\n",
"def get_verb(doc):\n",
" verb_list = []\n",
" for word in doc:\n",
" if word.pos_ in ['VERB']:\n",
" verb_list.append(word.text)\n",
" verb_list = remove_duplicates(verb_list)\n",
" return \",\".join(verb_list)\n",
"\n",
"\n",
"def get_ner(doc):\n",
" ner_list = []\n",
" for ent in doc.ents:\n",
" ner_list.append(ent.text)\n",
" ner_list = remove_duplicates(ner_list)\n",
" return \",\".join(ner_list)\n",
"\n",
"\n",
"def get_keywords(docs):\n",
" language = \"en\"\n",
" max_ngram_size = 3\n",
" deduplication_thresold = 0.9\n",
" deduplication_algo = 'seqm'\n",
" numOfKeywords = 1000\n",
" \n",
" list_of_keys = list()\n",
" custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_thresold, dedupFunc=deduplication_algo, top=numOfKeywords, features=None)\n",
" \n",
" for loc, each_article in enumerate(docs):\n",
" keywords = custom_kw_extractor.extract_keywords(each_article)\n",
" temp1 = list()\n",
" for i, j in keywords:\n",
" temp1.append(j)\n",
" list_of_keys.append(\", \".join(temp1))\n",
" return list_of_keys\n",
"\n",
"\n",
"def get_features(docs, stages):\n",
" default_stages = {\n",
" 'nouns': True,\n",
" 'verbs': True,\n",
" 'noun_phrases': False,\n",
" 'keywords': False,\n",
" 'ner': True, \n",
" }\n",
" default_stages = default_stages.update(stages)\n",
" \n",
" t = time.time()\n",
" noun_chunks = list()\n",
" # verbs_list = list()\n",
" nlp = spacy.load('en_core_web_sm')\n",
" # ners_list = list()\n",
" # nouns_list = list()\n",
"\n",
"# for text in docs:\n",
"# doc = parse_doc(nlp, text)\n",
" # verbs_list.append(get_verb(doc))\n",
" # ners_list.append(get_ner(doc))\n",
" # nouns_list.append(get_noun(doc))\n",
"# noun_chunks.append(','.join(remove_duplicates([str(x) for x in list(doc.noun_chunks)])))\n",
" \n",
" for doc in nlp.pipe(docs, disable=['ner']):\n",
" # nouns_list.append(get_noun(doc))\n",
" # verbs_list.append(get_verb(doc))\n",
" noun_chunks.append(','.join(remove_duplicates([str(x) for x in list(doc.noun_chunks)])))\n",
"\n",
" # keywords = [\",\".join(filter(None, [x, y])) for x, y in zip(noun_chunks, verbs_list)]\n",
" # keywords = append_ngrams(nouns_list, 2)\n",
" # keywords = noun_chunks\n",
" yake_keywords = get_keywords(docs)\n",
" keywords = [\",\".join(filter(None, [x, y])) for x, y in zip(noun_chunks, yake_keywords)]\n",
" keywords = [','.join(set(x.split(','))) for x in keywords]\n",
" print(\"Time elapsed for Keyword Extraction:\", time.time() - t)\n",
" return keywords"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time elapsed for Keyword Extraction: 0.4613072872161865\n"
]
},
{
"data": {
"text/plain": [
"['Time,this space,Keywords Extraction, elapsed for keywords, time elapsed, keywords, time, elapsed, extraction, space,keywords extraction, defined',\n",
" 'you,what,me']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs = ['Time elapsed for Keywords Extraction would be defined here in this space', 'Also, let me know what you think of this']\n",
"get_features(docs, {})"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time time NOUN NN nsubjpass Xxxx True False\n",
"elapsed elapse VERB VBN acl xxxx True False\n",
"for for ADP IN prep xxx True True\n",
"Keywords Keywords PROPN NNP compound Xxxxx True False\n",
"Extraction Extraction PROPN NNP pobj Xxxxx True False\n",
"would would VERB MD aux xxxx True True\n",
"be be AUX VB auxpass xx True True\n",
"defined define VERB VBN ROOT xxxx True False\n",
"here here ADV RB advmod xxxx True True\n",
"in in ADP IN prep xx True True\n",
"this this DET DT det xxxx True True\n",
"space space NOUN NN pobj xxxx True False\n",
"Also also ADV RB advmod Xxxx True True\n",
", , PUNCT , punct , False False\n",
"let let VERB VB ROOT xxx True False\n",
"me -PRON- PRON PRP nsubj xx True True\n",
"know know VERB VB ccomp xxxx True False\n",
"what what PRON WP dobj xxxx True True\n",
"you -PRON- PRON PRP nsubj xxx True True\n",
"think think VERB VBP ccomp xxxx True False\n",
"of of ADP IN prep xx True True\n",
"this this DET DT pobj xxxx True True\n"
]
}
],
"source": [
"nlp = spacy.load('en_core_web_sm')\n",
"for doc in docs:\n",
" da = nlp(doc)\n",
" for token in da:\n",
" print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,\n",
" token.shape_, token.is_alpha, token.is_stop)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Updates "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"* Feature extractions like Nouns, Verbs, Adjectives, Numbers, Noun Phrases, NERs, Keywords\n",
"* Vectorization tools like TF-IDF, GloVe, Word2Vec, Bag of Words"
]
},
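{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal, illustrative sketch of the planned vectorization step (Bag of Words and TF-IDF) listed above. It is not wired into the pipeline in this notebook, and scikit-learn is an assumed extra dependency:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: Bag-of-Words counts and TF-IDF weights over the sample docs above\n",
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
"\n",
"bow = CountVectorizer().fit_transform(docs)    # Bag of Words term counts\n",
"tfidf = TfidfVectorizer().fit_transform(docs)  # TF-IDF weights\n",
"print(bow.shape, tfidf.shape)"
]
},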
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -1,239 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import time\n",
"import pandas as pd\n",
"\n",
"from preprocess_nlp import preprocess_nlp\n",
"from preprocess_nlp import async_call_preprocess"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Read Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df = pd.read_excel('data.xlsx', nrows=30000)\n",
"\n",
"# Make sure there are no Null values and the data type is <str>\n",
"df.dropna(inplace=True)\n",
"df['body'] = df['body'].astype('str')\n",
"\n",
"print(\"Total strings\", len(df['body'].tolist()))\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Define Stages"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Default Stages\n",
"stages = {'remove_tags_nonascii': True, \n",
" 'lower_case': True,\n",
" 'expand_contractions': False,\n",
" 'remove_escape_chars': True,\n",
" 'remove_punctuation': True,\n",
" 'remove_stopwords': False,\n",
" 'remove_numbers': True,\n",
" 'lemmatize': False,\n",
" 'stemming': False,\n",
" 'min_word_len': 2}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Sequential Processing"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color='red'>Note: Press <Shift+Tab> to access the docstring of <B><I>preprocess_nlp</B></I>, which contains various default parameters for stages of processing</font>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"start_time = time.time()\n",
"\n",
"# Processes data sequential without creating processes (Params - (Strings_to_be_processed, Dict_of_stages))\n",
"processed_text_seq = preprocess_nlp(df['body'].tolist(), stages)\n",
"\n",
"print(\"Time Elapsed:\", time.time()-start_time)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Parallel Processing"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color='red'>Note: Press <Shift+Tab> to access the docstring of <B><I>async_call_preprocess</B></I>, which contains various default parameters for stages of processing</font>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Order is not maintained\n",
"start_time = time.time()\n",
"\n",
"# Processes data simultaneously by creating processes (Params - (Strings_to_be_processed, Dict_of_stages, Number_of_processes))\n",
"processed_text_par = async_call_preprocess(df['body'].tolist(), stages, 2)\n",
"\n",
"print(\"Time Elapsed:\", time.time()-start_time)"
]
},
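{
"cell_type": "markdown",
"metadata": {},
"source": [
"A hedged sketch of flattening the parallel output: the trial implementation at the end of this notebook returns one list per worker chunk, so if the packaged async_call_preprocess behaves the same way (an assumption), the chunks can be flattened back into a single list:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Assumes async_call_preprocess returns one list per worker chunk,\n",
"# as the trial implementation at the end of this notebook does\n",
"processed_text_flat = [s for chunk in processed_text_par for s in chunk]\n",
"len(processed_text_flat)"
]
},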
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Write to Disk"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df_new = pd.DataFrame({'id': df['id'].tolist(), 'processed_text': processed_text})\n",
"df_new.to_excel('processed.xlsx', index=False)\n",
"df_new.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<hr>\n",
"<B><I>IGNORE - Trials for Multi-Thread</I></B><br>\n",
"<font color='purple'>Turns out processes are faster and run simultaneously allowing parallel processing. Threads are better for I/O sequences</font>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from multiprocessing.pool import ThreadPool\n",
"\n",
"def async_call_preprocess(strings, stages, n_processes=3):\n",
" \"\"\"\n",
" Function to create async threads for faster processing. Automatically creates threads and assigns data to each thread call\n",
" \n",
" :param strings: A list of strings to be preprocessed\n",
" :param stages: A dictionary with keys as stages and values as Boolean/Integer. Can be used to customize the stages in preprocessing\n",
" :param n_processes: Integer value of number of threads to be created\n",
" (Default parameters for stages)\n",
" {'remove_tags_nonascii': True, 'lower_case': True,'expand_contractions': False, 'remove_punctuation': True, 'remove_escape_chars': True, 'remove_stopwords': False, 'remove_numbers': True, 'lemmatize': False, 'stemming': False, 'min_word_len': 2\n",
" \n",
" <Returns a list of preprocessed strings, aggregated from threads>\n",
" \"\"\"\n",
" pool = ThreadPool(processes=n_processes)\n",
" \n",
" # Note the start time\n",
" start_time = time.time()\n",
" \n",
" # Calculate the indices of strings to be passed to multiple processes\n",
" ranges = calculate_ranges(len(strings), n_processes)\n",
" print(ranges)\n",
" \n",
" # Create processes and then pass data\n",
" process_dict = dict()\n",
" for i in range(len(ranges)-1):\n",
" string_set = strings[ranges[i] : ranges[i+1]]\n",
" process_dict[i] = pool.apply_async(preprocess_nlp, (string_set, stages)) # tuple of args for foo\n",
" \n",
" # Join the results\n",
" processed_strings = list()\n",
" for i in range(len(ranges)-1):\n",
" processed_strings.append(process_dict[i].get())\n",
" \n",
" for i in range(len(ranges)-1):\n",
" print(len(process_dict[i].get()))\n",
" \n",
" print(\"Time Elapsed:\", time.time()-start_time)\n",
" return processed_strings"
]
}
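,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A hedged sketch of the process-based alternative preferred in the note above: the same chunked fan-out via multiprocessing.Pool. It assumes preprocess_nlp and the data are picklable, and reuses the calculate_ranges stand-in defined in the trial cell:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from multiprocessing import Pool\n",
"\n",
"\n",
"def process_based_preprocess(strings, stages, n_processes=3):\n",
"    # Same chunking as the thread trial, but with real processes (no GIL contention)\n",
"    ranges = calculate_ranges(len(strings), n_processes)\n",
"    chunks = [strings[ranges[i]:ranges[i + 1]] for i in range(len(ranges) - 1)]\n",
"    with Pool(processes=n_processes) as pool:\n",
"        results = pool.starmap(preprocess_nlp, [(chunk, stages) for chunk in chunks])\n",
"    return [s for chunk in results for s in chunk]  # flatten; chunk order is preserved"
]
}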
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}