mirror of https://github.com/nikhiljsk/preprocess_nlp.git synced 2021-10-18 10:21:04 +03:00

vectorization and requirements

This commit is contained in:
nikhiljsk
2020-03-12 15:10:29 +05:30
parent 64ecd2d7d3
commit b07f3fe12a
3 changed files with 163 additions and 176 deletions

View File

@@ -1,8 +1,10 @@
nltk==3.4.5
spacy==2.2.3
yake==0.3.7
numpy==1.18.1
contractions==0.0.24
nltk==3.4.5
matplotlib==3.1.3
yake==0.3.7
spacy==2.2.3
beautifulsoup4==4.8.2
ipython==7.12.0
gensim==3.8.1
ipython==7.13.0
scikit_learn==0.22.2.post1

View File

@@ -16,178 +16,6 @@
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Library built"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import gensim\n",
"import numpy as np\n",
"from itertools import chain\n",
"from gensim.models import KeyedVectors\n",
"from gensim.scripts.glove2word2vec import glove2word2vec\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"\n",
"\n",
"# ------------------------ Count Vectorizer ------------------------\n",
"def custom_tokenizer(para):\n",
" words = list()\n",
" for sent in para.split(' . '):\n",
" words.append(sent.split())\n",
" return list(chain(*words))\n",
"\n",
"\n",
"def count_vectorizer(sentences, params={}):\n",
" default_params = {'strip_accents': None, \n",
" 'lowercase': True,\n",
" 'preprocessor': None, \n",
" 'tokenizer': None, \n",
" 'stop_words': None, \n",
" 'ngram_range': (1, 1), \n",
" 'analyzer': 'word', \n",
" 'max_df': 1.0, \n",
" 'min_df': 1, \n",
" 'max_features': None, \n",
" 'vocabulary': None}\n",
" default_params.update(params)\n",
" \n",
" cv = CountVectorizer(sentences, **default_params)\n",
" cv_trans_sent = cv.fit_transform(sentences)\n",
" \n",
" return cv, cv_trans_sent\n",
"\n",
"\n",
"# ------------------------ TF-IDF ------------------------\n",
"def tfidf_vectorizer(sentences, params={}):\n",
" default_params = {'smooth_idf': True,\n",
" 'use_idf': True,\n",
" 'strip_accents': None, \n",
" 'lowercase': True,\n",
" 'preprocessor': None, \n",
" 'tokenizer': None, \n",
" 'stop_words': None, \n",
" 'ngram_range': (1, 1), \n",
" 'analyzer': 'word', \n",
" 'max_df': 1.0, \n",
" 'min_df': 1, \n",
" 'max_features': None, \n",
" 'vocabulary': None}\n",
" default_params.update(params)\n",
" \n",
" tf = TfidfVectorizer(**default_params)\n",
" tf_trans_sent = tf.fit_transform(sentences)\n",
" \n",
" return tf, tf_trans_sent\n",
"\n",
"\n",
"def top_words_tfidf(tf_obj, doc, topn=20): \n",
" # Function code credits: https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/\n",
" tf_idf_vector = tf_obj.transform(doc)\n",
" tuples = zip(tf_idf_vector.tocoo().col, tf_idf_vector.tocoo().data)\n",
" sorted_items = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)\n",
" \n",
" feature_names = tf_obj.get_feature_names()\n",
" \n",
" #use only topn items from vector\n",
" sorted_items = sorted_items[:topn]\n",
" score_vals = []\n",
" feature_vals = []\n",
" \n",
" # word index and corresponding tf-idf score\n",
" for idx, score in sorted_items:\n",
" \n",
" #keep track of feature name and its corresponding score\n",
" score_vals.append(round(score, 3))\n",
" feature_vals.append(feature_names[idx])\n",
" \n",
" #create a tuples of feature,score\n",
" results= {}\n",
" for idx in range(len(feature_vals)):\n",
" results[feature_vals[idx]]=score_vals[idx]\n",
"\n",
" return results\n",
"\n",
"\n",
"# ------------------------ Word2Vec ------------------------\n",
"def load_word2vec(path=None):\n",
" try:\n",
" model = KeyedVectors.load_word2vec_format(path, binary=True)\n",
" print(\"Model loaded successfully!\")\n",
" return model\n",
" except Exception as e:\n",
" print(\"Please download the dataset from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit\")\n",
" print(\"-----!! MODEL NOT LOADED !!-----\")\n",
" print('\\n\\n\\nError:\\t', e)\n",
"\n",
" \n",
"def train_word2vec(documents, params={}):\n",
" default_params = {'size': 100,\n",
" 'window': 10,\n",
" 'min_count': 1, \n",
" 'workers': 8}\n",
" default_params.update(params)\n",
" model = gensim.models.Word2Vec (documents, **default_params)\n",
" model.train(documents,total_examples=len(documents),epochs=50)\n",
" \n",
" return model\n",
"\n",
"\n",
"# ------------------------ GloVe ------------------------\n",
"def load_glove(path=None):\n",
" try:\n",
" temp = glove2word2vec(path, path+'.word2vec')\n",
" except Exception as e:\n",
" print(\"Please download the glove.6B.zip dataset from: https://nlp.stanford.edu/projects/glove/\")\n",
" print(\"-----!! MODEL NOT LOADED !!-----\")\n",
" print('\\n\\n\\nError:\\t', e)\n",
" return None\n",
" \n",
" model = KeyedVectors.load_word2vec_format(path+'.word2vec', binary=False)\n",
" print(\"Model loaded successfully!\")\n",
" return model\n",
"\n",
"\n",
"# ------------------------ Word2Vec & GloVe ------------------------\n",
"def get_most_similar(model, pos_word, neg_word=None, topn=1):\n",
" return model.wv.most_similar(positive=pos_word, negative=neg_word, topn=topn)\n",
"\n",
"\n",
"def get_similarity_score(model, w1, w2):\n",
" return model.wv.similarity(w1, w2)\n",
"\n",
"\n",
"def get_sentence_wise_vector(model, docs):\n",
" # Initialize dictionary with existing vocab\n",
" w2v_words = {}\n",
" for ele in list(model.wv.vocab):\n",
" w2v_words[ele] = 0\n",
" \n",
" sent_vectors = []; # the avg-w2v for each sentence/review is stored in this list\n",
" for sent in docs: # for each review/sentence\n",
" sent_vec = np.zeros(model.wv[list(model.wv.vocab.keys())[0]].shape) # as word vectors are of zero length\n",
" cnt_words =0; # num of words with a valid vector in the sentence/review\n",
" for word in sent: # for each word in a review/sentence\n",
" if word in w2v_words:\n",
" vec = model.wv[word]\n",
" sent_vec += vec\n",
" cnt_words += 1\n",
" if cnt_words != 0:\n",
" sent_vec /= cnt_words\n",
" sent_vectors.append(sent_vec)\n",
" \n",
" return sent_vectors"
]
},
{
"cell_type": "markdown",
"metadata": {},

View File

@@ -0,0 +1,157 @@
import gensim
import numpy as np
from itertools import chain
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# ------------------------ Count Vectorizer ------------------------
def custom_tokenizer(para):
words = list()
for sent in para.split(' . '):
words.append(sent.split())
return list(chain(*words))
def count_vectorizer(sentences, params={}):
default_params = {'strip_accents': None,
'lowercase': True,
'preprocessor': None,
'tokenizer': None,
'stop_words': None,
'ngram_range': (1, 1),
'analyzer': 'word',
'max_df': 1.0,
'min_df': 1,
'max_features': None,
'vocabulary': None}
default_params.update(params)
    cv = CountVectorizer(**default_params)
cv_trans_sent = cv.fit_transform(sentences)
return cv, cv_trans_sent
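# Illustrative usage (a sketch; the sample sentences below are made up and not part of the repo):
#   sentences = ["the cat sat on the mat", "the dog sat on the log"]
#   cv, counts = count_vectorizer(sentences, params={'ngram_range': (1, 2)})
#   # counts is a sparse document-term matrix; cv.get_feature_names() returns the learned vocabulary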
# ------------------------ TF-IDF ------------------------
def tfidf_vectorizer(sentences, params={}):
default_params = {'smooth_idf': True,
'use_idf': True,
'strip_accents': None,
'lowercase': True,
'preprocessor': None,
'tokenizer': None,
'stop_words': None,
'ngram_range': (1, 1),
'analyzer': 'word',
'max_df': 1.0,
'min_df': 1,
'max_features': None,
'vocabulary': None}
default_params.update(params)
tf = TfidfVectorizer(**default_params)
tf_trans_sent = tf.fit_transform(sentences)
return tf, tf_trans_sent
def top_words_tfidf(tf_obj, doc, topn=20):
# Function code credits: https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/
tf_idf_vector = tf_obj.transform(doc)
tuples = zip(tf_idf_vector.tocoo().col, tf_idf_vector.tocoo().data)
sorted_items = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
feature_names = tf_obj.get_feature_names()
#use only topn items from vector
sorted_items = sorted_items[:topn]
score_vals = []
feature_vals = []
# word index and corresponding tf-idf score
for idx, score in sorted_items:
#keep track of feature name and its corresponding score
score_vals.append(round(score, 3))
feature_vals.append(feature_names[idx])
    # map each feature name to its rounded tf-idf score
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]
return results
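# Illustrative usage (a sketch; assumes `sentences` is a list of raw text documents as above):
#   tf, tfidf_matrix = tfidf_vectorizer(sentences, params={'stop_words': 'english'})
#   top_words_tfidf(tf, ["a new document to score"], topn=10)
#   # returns a dict mapping the highest-scoring terms of the new document to rounded tf-idf scores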
# ------------------------ Word2Vec ------------------------
def load_word2vec(path=None):
try:
model = KeyedVectors.load_word2vec_format(path, binary=True)
print("Model loaded successfully!")
return model
except Exception as e:
print("Please download the dataset from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit")
print("-----!! MODEL NOT LOADED !!-----")
print('\n\n\nError:\t', e)
def train_word2vec(documents, params={}):
default_params = {'size': 100,
'window': 10,
'min_count': 1,
'workers': 8}
default_params.update(params)
    model = gensim.models.Word2Vec(documents, **default_params)
    model.train(documents, total_examples=len(documents), epochs=50)
return model
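# Illustrative usage (a sketch; word2vec expects tokenised documents, i.e. lists of word lists):
#   tokenised_docs = [sent.split() for sent in sentences]
#   w2v = train_word2vec(tokenised_docs, params={'size': 50, 'window': 5})
#   # or load the pretrained GoogleNews vectors (assumed local filename):
#   # w2v = load_word2vec('GoogleNews-vectors-negative300.bin')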
# ------------------------ GloVe ------------------------
def load_glove(path=None):
try:
temp = glove2word2vec(path, path+'.word2vec')
except Exception as e:
print("Please download the glove.6B.zip dataset from: https://nlp.stanford.edu/projects/glove/")
print("-----!! MODEL NOT LOADED !!-----")
print('\n\n\nError:\t', e)
return None
model = KeyedVectors.load_word2vec_format(path+'.word2vec', binary=False)
print("Model loaded successfully!")
return model
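# Illustrative usage (hypothetical local path to the unzipped GloVe file):
#   glove = load_glove('glove.6B.100d.txt')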
# ------------------------ Word2Vec & GloVe ------------------------
def get_most_similar(model, pos_word, neg_word=None, topn=1):
return model.wv.most_similar(positive=pos_word, negative=neg_word, topn=topn)
def get_similarity_score(model, w1, w2):
return model.wv.similarity(w1, w2)
def get_sentence_wise_vector(model, docs):
# Initialize dictionary with existing vocab
w2v_words = {}
for ele in list(model.wv.vocab):
w2v_words[ele] = 0
    sent_vectors = []  # the average word2vec vector for each sentence/review is stored in this list
    for sent in docs:  # for each review/sentence
        sent_vec = np.zeros(model.wv.vector_size)  # start with a zero vector of the embedding dimensionality
        cnt_words = 0  # num of words with a valid vector in the sentence/review
for word in sent: # for each word in a review/sentence
if word in w2v_words:
vec = model.wv[word]
sent_vec += vec
cnt_words += 1
if cnt_words != 0:
sent_vec /= cnt_words
sent_vectors.append(sent_vec)
return sent_vectors
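# ------------------------ Usage sketch ------------------------
# A minimal end-to-end sketch of the helpers above, guarded so importing the module has no
# side effects. The toy corpus is made up purely for illustration; a pretrained model returned
# by load_word2vec/load_glove could be passed in place of the freshly trained one.
if __name__ == '__main__':
    corpus = ["the cat sat on the mat", "the dog chased the cat"]
    tokenised = [sent.split() for sent in corpus]

    # Bag-of-words and tf-idf representations
    cv, counts = count_vectorizer(corpus)
    tf, tfidf_matrix = tfidf_vectorizer(corpus)
    print(top_words_tfidf(tf, ["the dog chased the mat"], topn=5))

    # Train a small word2vec model and query it
    w2v = train_word2vec(tokenised, params={'size': 50, 'window': 2})
    print(get_most_similar(w2v, ['cat'], topn=2))
    print(get_similarity_score(w2v, 'cat', 'dog'))

    # One averaged word vector per sentence
    print(get_sentence_wise_vector(w2v, tokenised)[0].shape)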