mirror of https://github.com/nikhiljsk/preprocess_nlp.git (synced 2021-10-18 10:21:04 +03:00)
remove_files
@@ -1,237 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import spacy\n",
"import nltk\n",
"import yake\n",
"\n",
"\n",
"def remove_duplicates(ele_list):\n",
"    new_list = []\n",
"    for ele in ele_list:\n",
"        if ele not in new_list:\n",
"            new_list.append(ele)\n",
"    return new_list\n",
"\n",
"\n",
"def append_ngrams(noun_list, n_size=2):\n",
"    new_list = list()\n",
"    for sentence in noun_list:\n",
"        new_list.append(','.join([x+'_'+y for x,y in list(nltk.ngrams(sentence.split(','), n_size))]))\n",
"    new_list = [noun_list[i] + ',' + new_list[i] for i in range(len(noun_list))]\n",
"    return new_list\n",
"\n",
"\n",
"def parse_doc(nlp, text):\n",
"    return nlp(text)\n",
"\n",
"\n",
"def get_noun(doc):\n",
"    noun_list = []\n",
"    for word in doc:\n",
"        if word.pos_ in ['PROPN', 'NOUN']:\n",
"            noun_list.append(word.text)\n",
"    noun_list = remove_duplicates(noun_list)\n",
"    return \",\".join(noun_list)\n",
"\n",
"\n",
"def get_verb(doc):\n",
"    verb_list = []\n",
"    for word in doc:\n",
"        if word.pos_ in ['VERB']:\n",
"            verb_list.append(word.text)\n",
"    verb_list = remove_duplicates(verb_list)\n",
"    return \",\".join(verb_list)\n",
"\n",
"\n",
"def get_ner(doc):\n",
"    ner_list = []\n",
"    for ent in doc.ents:\n",
"        ner_list.append(ent.text)\n",
"    ner_list = remove_duplicates(ner_list)\n",
"    return \",\".join(ner_list)\n",
"\n",
"\n",
"def get_keywords(docs):\n",
"    language = \"en\"\n",
"    max_ngram_size = 3\n",
"    deduplication_threshold = 0.9\n",
"    deduplication_algo = 'seqm'\n",
"    numOfKeywords = 1000\n",
"    \n",
"    list_of_keys = list()\n",
"    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, top=numOfKeywords, features=None)\n",
"    \n",
"    for loc, each_article in enumerate(docs):\n",
"        keywords = custom_kw_extractor.extract_keywords(each_article)\n",
"        temp1 = list()\n",
"        for i, j in keywords:\n",
"            temp1.append(j)\n",
"        list_of_keys.append(\", \".join(temp1))\n",
"    return list_of_keys\n",
"\n",
"\n",
"def get_features(docs, stages):\n",
"    default_stages = {\n",
"        'nouns': True,\n",
"        'verbs': True,\n",
"        'noun_phrases': False,\n",
"        'keywords': False,\n",
"        'ner': True,\n",
"    }\n",
"    # dict.update() mutates in place and returns None, so don't reassign\n",
"    default_stages.update(stages)\n",
"    \n",
"    t = time.time()\n",
"    noun_chunks = list()\n",
"    # verbs_list = list()\n",
"    nlp = spacy.load('en_core_web_sm')\n",
"    # ners_list = list()\n",
"    # nouns_list = list()\n",
"\n",
"    # for text in docs:\n",
"    #     doc = parse_doc(nlp, text)\n",
"    #     verbs_list.append(get_verb(doc))\n",
"    #     ners_list.append(get_ner(doc))\n",
"    #     nouns_list.append(get_noun(doc))\n",
"    #     noun_chunks.append(','.join(remove_duplicates([str(x) for x in list(doc.noun_chunks)])))\n",
"    \n",
"    for doc in nlp.pipe(docs, disable=['ner']):\n",
"        # nouns_list.append(get_noun(doc))\n",
"        # verbs_list.append(get_verb(doc))\n",
"        noun_chunks.append(','.join(remove_duplicates([str(x) for x in list(doc.noun_chunks)])))\n",
"\n",
"    # keywords = [\",\".join(filter(None, [x, y])) for x, y in zip(noun_chunks, verbs_list)]\n",
"    # keywords = append_ngrams(nouns_list, 2)\n",
"    # keywords = noun_chunks\n",
"    yake_keywords = get_keywords(docs)\n",
"    keywords = [\",\".join(filter(None, [x, y])) for x, y in zip(noun_chunks, yake_keywords)]\n",
"    keywords = [','.join(set(x.split(','))) for x in keywords]\n",
"    print(\"Time elapsed for Keyword Extraction:\", time.time() - t)\n",
"    return keywords"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time elapsed for Keyword Extraction: 0.4613072872161865\n"
]
},
{
"data": {
"text/plain": [
"['Time,this space,Keywords Extraction, elapsed for keywords, time elapsed, keywords, time, elapsed, extraction, space,keywords extraction, defined',\n",
" 'you,what,me']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs = ['Time elapsed for Keywords Extraction would be defined here in this space', 'Also, let me know what you think of this']\n",
"get_features(docs, {})"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time time NOUN NN nsubjpass Xxxx True False\n",
"elapsed elapse VERB VBN acl xxxx True False\n",
"for for ADP IN prep xxx True True\n",
"Keywords Keywords PROPN NNP compound Xxxxx True False\n",
"Extraction Extraction PROPN NNP pobj Xxxxx True False\n",
"would would VERB MD aux xxxx True True\n",
"be be AUX VB auxpass xx True True\n",
"defined define VERB VBN ROOT xxxx True False\n",
"here here ADV RB advmod xxxx True True\n",
"in in ADP IN prep xx True True\n",
"this this DET DT det xxxx True True\n",
"space space NOUN NN pobj xxxx True False\n",
"Also also ADV RB advmod Xxxx True True\n",
", , PUNCT , punct , False False\n",
"let let VERB VB ROOT xxx True False\n",
"me -PRON- PRON PRP nsubj xx True True\n",
"know know VERB VB ccomp xxxx True False\n",
"what what PRON WP dobj xxxx True True\n",
"you -PRON- PRON PRP nsubj xxx True True\n",
"think think VERB VBP ccomp xxxx True False\n",
"of of ADP IN prep xx True True\n",
"this this DET DT pobj xxxx True True\n"
]
}
],
"source": [
"nlp = spacy.load('en_core_web_sm')\n",
"for doc in docs:\n",
"    da = nlp(doc)\n",
"    for token in da:\n",
"        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,\n",
"              token.shape_, token.is_alpha, token.is_stop)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Updates"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"* Feature extraction: Nouns, Verbs, Adjectives, Numbers, Noun Phrases, NERs, Keywords\n",
"* Vectorization tools: TF-IDF, GloVe, Word2Vec, Bag of Words"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
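The append_ngrams helper above is only referenced from commented-out code, so its behaviour is easy to miss: it takes comma-joined noun strings and appends underscore-joined n-grams of consecutive entries. A minimal usage sketch, assuming the cell defining it has been run (the input string here is made up for illustration):

    nouns = ["time,keywords,extraction,space"]
    print(append_ngrams(nouns, 2))
    # ['time,keywords,extraction,space,time_keywords,keywords_extraction,extraction_space']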
@@ -1,239 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import time\n",
"import pandas as pd\n",
"\n",
"from preprocess_nlp import preprocess_nlp\n",
"from preprocess_nlp import async_call_preprocess"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Read Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df = pd.read_excel('data.xlsx', nrows=30000)\n",
"\n",
"# Make sure there are no Null values and the data type is <str>\n",
"df.dropna(inplace=True)\n",
"df['body'] = df['body'].astype('str')\n",
"\n",
"print(\"Total strings\", len(df['body'].tolist()))\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Define Stages"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Default Stages\n",
"stages = {'remove_tags_nonascii': True,\n",
"          'lower_case': True,\n",
"          'expand_contractions': False,\n",
"          'remove_escape_chars': True,\n",
"          'remove_punctuation': True,\n",
"          'remove_stopwords': False,\n",
"          'remove_numbers': True,\n",
"          'lemmatize': False,\n",
"          'stemming': False,\n",
"          'min_word_len': 2}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Sequential Processing"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color='red'>Note: Press <Shift+Tab> to access the docstring of <B><I>preprocess_nlp</I></B>, which lists the default parameters for the various stages of processing</font>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"start_time = time.time()\n",
"\n",
"# Processes the data sequentially, without creating extra processes (Params - (Strings_to_be_processed, Dict_of_stages))\n",
"processed_text_seq = preprocess_nlp(df['body'].tolist(), stages)\n",
"\n",
"print(\"Time Elapsed:\", time.time()-start_time)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Parallel Processing"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color='red'>Note: Press <Shift+Tab> to access the docstring of <B><I>async_call_preprocess</I></B>, which lists the default parameters for the various stages of processing</font>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Order is not maintained\n",
"start_time = time.time()\n",
"\n",
"# Processes the data in parallel by creating multiple processes (Params - (Strings_to_be_processed, Dict_of_stages, Number_of_processes))\n",
"processed_text_par = async_call_preprocess(df['body'].tolist(), stages, 2)\n",
"\n",
"print(\"Time Elapsed:\", time.time()-start_time)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Write to Disk"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Use the sequential output here; the parallel output does not preserve the original order\n",
"df_new = pd.DataFrame({'id': df['id'].tolist(), 'processed_text': processed_text_seq})\n",
"df_new.to_excel('processed.xlsx', index=False)\n",
"df_new.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<hr>\n",
"<B><I>IGNORE - Trials for Multi-Threading</I></B><br>\n",
"<font color='purple'>It turns out that separate processes run truly in parallel and are faster here; threads are better suited to I/O-bound work</font>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from multiprocessing.pool import ThreadPool\n",
"\n",
"def async_call_preprocess(strings, stages, n_processes=3):\n",
"    \"\"\"\n",
"    Function to create async threads for faster processing. Automatically creates threads and assigns data to each thread call.\n",
"    \n",
"    :param strings: A list of strings to be preprocessed\n",
"    :param stages: A dictionary with keys as stages and values as Boolean/Integer. Can be used to customize the stages in preprocessing\n",
"    :param n_processes: Integer value for the number of threads to be created\n",
"    (Default parameters for stages)\n",
"    {'remove_tags_nonascii': True, 'lower_case': True, 'expand_contractions': False, 'remove_punctuation': True, 'remove_escape_chars': True, 'remove_stopwords': False, 'remove_numbers': True, 'lemmatize': False, 'stemming': False, 'min_word_len': 2}\n",
"    \n",
"    <Returns a list of preprocessed strings, aggregated from threads>\n",
"    \"\"\"\n",
"    pool = ThreadPool(processes=n_processes)\n",
"    \n",
"    # Note the start time\n",
"    start_time = time.time()\n",
"    \n",
"    # Calculate the indices of strings to be passed to the threads\n",
"    # (calculate_ranges is a helper that is not defined in this cell and must be in scope)\n",
"    ranges = calculate_ranges(len(strings), n_processes)\n",
"    print(ranges)\n",
"    \n",
"    # Create threads and then pass data\n",
"    process_dict = dict()\n",
"    for i in range(len(ranges)-1):\n",
"        string_set = strings[ranges[i] : ranges[i+1]]\n",
"        process_dict[i] = pool.apply_async(preprocess_nlp, (string_set, stages))  # tuple of args for preprocess_nlp\n",
"    \n",
"    # Join the results\n",
"    processed_strings = list()\n",
"    for i in range(len(ranges)-1):\n",
"        processed_strings.append(process_dict[i].get())\n",
"    \n",
"    for i in range(len(ranges)-1):\n",
"        print(len(process_dict[i].get()))\n",
"    \n",
"    print(\"Time Elapsed:\", time.time()-start_time)\n",
"    return processed_strings"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
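The ThreadPool trial cell above calls calculate_ranges, which is not defined in the deleted notebook and is presumably provided by the preprocess_nlp module. A minimal sketch of such a helper, under the assumption that it returns index boundaries splitting the input into roughly equal slices (the real helper's name and exact behaviour may differ):

    def calculate_ranges(n_strings, n_processes):
        # Boundaries like [0, k, 2k, ..., n_strings] so that
        # strings[ranges[i]:ranges[i+1]] gives one chunk per thread
        step = -(-n_strings // n_processes)  # ceiling division
        return list(range(0, n_strings, step)) + [n_strings]

    calculate_ranges(10, 3)  # [0, 4, 8, 10]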