mirror of
https://github.com/nikhiljsk/preprocess_nlp.git
synced 2021-10-18 10:21:04 +03:00
documentation
@@ -240,7 +240,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.10"
+   "version": "3.6.9"
   }
  },
  "nbformat": 4,
@@ -10,6 +10,13 @@ from sklearn.feature_extraction.text import TfidfVectorizer


 # ------------------------ Count Vectorizer ------------------------
 def custom_tokenizer(para):
+    """
+    Custom tokenizer; assumes the sentences are separated by ' . '. By default this is called whenever CountVectorizer or TfidfVectorizer is called.
+
+    :param para: A string where sentences are separated by ' . '
+
+    <Returns list of words>
+    """
     words = list()
     for sent in para.split(' . '):
         words.extend(sent.split())
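Usage note: a minimal sketch of what the tokenizer returns, assuming the module is imported so that custom_tokenizer is in scope; the input string is invented for illustration:

    # Sentences are split on ' . ', then each sentence on whitespace
    tokens = custom_tokenizer('the cat sat . the dog ran')
    print(tokens)  # ['the', 'cat', 'sat', 'the', 'dog', 'ran']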
@@ -17,6 +24,14 @@ def custom_tokenizer(para):


 def count_vectorizer(sentences, params={}):
+    """
+    count_vectorizer uses Scikit-learn's CountVectorizer.
+
+    :param sentences: List of strings
+    :param params: Dictionary with parameters to be passed to CountVectorizer
+
+    <Returns CV_Object and Transformed_sentences>
+    """
     default_params = {'strip_accents': None,
                       'lowercase': True,
                       'preprocessor': None,
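Usage note: a minimal sketch of calling the wrapper, assuming (per its docstring) it fits on the given sentences and returns the fitted CountVectorizer together with the transformed matrix; the corpus and the params override are invented for illustration:

    sentences = ['the cat sat on the mat', 'the dog sat on the log']
    cv, cv_trans_sent = count_vectorizer(sentences, params={'lowercase': True})
    print(cv.get_feature_names())   # Vocabulary learned from the corpus
    print(cv_trans_sent.toarray())  # Term counts as a dense matrix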
@@ -38,6 +53,14 @@ def count_vectorizer(sentences, params={}):


 # ------------------------ TF-IDF ------------------------
 def tfidf_vectorizer(sentences, params={}):
+    """
+    tfidf_vectorizer uses Scikit-learn's TfidfVectorizer.
+
+    :param sentences: List of strings
+    :param params: Dictionary with parameters to be passed to TfidfVectorizer
+
+    <Returns TFIDF_Object and Transformed_sentences>
+    """
     default_params = {'smooth_idf': True,
                       'use_idf': True,
                       'strip_accents': None,
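Usage note: the TF-IDF wrapper mirrors count_vectorizer; a minimal sketch with an invented corpus, assuming it returns the fitted TfidfVectorizer and the transformed sentences as shown in the hunk below:

    sentences = ['the cat sat on the mat', 'the dog sat on the log']
    tf, tf_trans_sent = tfidf_vectorizer(sentences, params={'use_idf': True})
    print(tf_trans_sent.shape)  # (number_of_sentences, vocabulary_size)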
@@ -59,36 +82,56 @@ def tfidf_vectorizer(sentences, params={}):
     return tf, tf_trans_sent


 def top_words_tfidf(tf_obj, doc, topn=20):
+    """
+    A function to find the top words (highest TF-IDF scores) in a given string, according to the TFIDF_Object passed.
+
+    :param tf_obj: TFIDF Object
+    :param doc: String from which the top words are to be extracted
+    :param topn: Number of top words to be returned
+
+    <Returns a dictionary of top words, keys-words and values-TFIDF_scores>
+    """
     # Function code credits: https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/

     # Transform the string to a vector and sort the words according to their TF-IDF scores
     tf_idf_vector = tf_obj.transform([doc])
     tuples = zip(tf_idf_vector.tocoo().col, tf_idf_vector.tocoo().data)
     sorted_items = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

     # Get the feature names or vocab
     feature_names = tf_obj.get_feature_names()

-    #use only topn items from vector
+    # Use only topn items from vector
     sorted_items = sorted_items[:topn]
     score_vals = []
     feature_vals = []

-    # word index and corresponding tf-idf score
+    # Word index and corresponding tf-idf score
     for idx, score in sorted_items:

-        #keep track of feature name and its corresponding score
+        # Keep track of feature name and its corresponding score
         score_vals.append(round(score, 3))
         feature_vals.append(feature_names[idx])

-    #create a tuples of feature,score
+    # Create tuples of (feature, score)
     results = {}
     for idx in range(len(feature_vals)):
         results[feature_vals[idx]] = score_vals[idx]

     # Top words
     return results


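Usage note: a minimal sketch of extracting keywords with the fitted object returned by tfidf_vectorizer; the corpus and the query document are invented for illustration, and the exact scores depend on the corpus:

    sentences = ['the cat sat on the mat', 'the dog sat on the log']
    tf, _ = tfidf_vectorizer(sentences)
    print(top_words_tfidf(tf, 'the cat sat on the mat', topn=3))  # e.g. {'cat': ..., 'mat': ..., 'sat': ...}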
 # ------------------------ Word2Vec ------------------------
 def load_word2vec(path=None):
+    """
+    Function to load a pretrained Word2Vec model. Gensim Word2Vec.
+
+    :param path: Path to the word2vec file
+
+    <Returns loaded Gensim Word2Vec model>
+    """
     try:
         model = KeyedVectors.load_word2vec_format(path, binary=True)
         print("Model loaded successfully!")
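Usage note: a minimal sketch; the filename is a placeholder for a pretrained word2vec binary (e.g. the GoogleNews vectors) and is not shipped with the repository:

    model = load_word2vec(path='GoogleNews-vectors-negative300.bin')  # Placeholder path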
@@ -100,11 +143,20 @@ def load_word2vec(path=None):


 def train_word2vec(documents, params={}):
+    """
+    Function to train a Word2Vec model on a given corpus. Gensim Word2Vec.
+
+    :param documents: List of lists of words (tokenized sentences)
+    :param params: Dictionary of parameters to be passed to Gensim Word2Vec
+
+    <Returns Gensim Word2Vec model>
+    """
     default_params = {'size': 100,
                       'window': 10,
                       'min_count': 1,
                       'workers': 8}
     default_params.update(params)

     model = gensim.models.Word2Vec(documents, **default_params)
     model.train(documents, total_examples=len(documents), epochs=50)
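Usage note: a minimal sketch with an invented toy corpus of tokenized sentences (the form gensim's Word2Vec expects); 'size' is the gensim 3.x name for the vector dimensionality:

    docs = [['the', 'cat', 'sat'], ['the', 'dog', 'ran']]
    w2v = train_word2vec(docs, params={'size': 50, 'min_count': 1})
    print(w2v.wv['cat'].shape)  # (50,)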
@@ -113,6 +165,14 @@ def train_word2vec(documents, params={}):


 # ------------------------ GloVe ------------------------
 def load_glove(path=None):
+    """
+    Function to load a pretrained GloVe model. It converts the GloVe vectors to Gensim Word2Vec format.
+
+    :param path: Path to the GloVe file
+
+    <Returns loaded Gensim Word2Vec model (converted)>
+    """
     # Convert the GloVe vectors to Word2Vec format
     try:
         temp = glove2word2vec(path, path + '.word2vec')
     except Exception as e:
@@ -121,31 +181,62 @@ def load_glove(path=None):
         print('\n\n\nError:\t', e)
         return None

     # Load the pretrained model
     model = KeyedVectors.load_word2vec_format(path + '.word2vec', binary=False)
     print("Model loaded successfully!")

     return model

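Usage note: a minimal sketch; the filename is a placeholder for a downloaded GloVe text file, and the '.word2vec' sibling file is created next to it by the conversion above:

    glove_model = load_glove(path='glove.6B.100d.txt')  # Placeholder path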
 # ------------------------ Word2Vec & GloVe ------------------------
 def get_most_similar(model, pos_word, neg_word=None, topn=1):
+    """
+    Given a Gensim Word2Vec model, find the top words related to a word or list of words.
+
+    :param model: Gensim Word2Vec model
+    :param pos_word: A single word or list of words that count positively towards the top words
+    :param neg_word: A single word or list of words that count negatively towards the top words
+    :param topn: Number of top words to be extracted
+
+    <Returns list of tuples (word, similarity_score)>
+    """
     return model.wv.most_similar(positive=pos_word, negative=neg_word, topn=topn)

 def get_similarity_score(model, w1, w2):
+    """
+    Given a Gensim Word2Vec model, find the similarity score for w1 & w2.
+
+    :param model: Gensim Word2Vec model
+    :param w1: First word
+    :param w2: Second word
+
+    <Returns a float value, the similarity between the two given words>
+    """
     return model.wv.similarity(w1, w2)

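Usage note: a minimal sketch for both lookup helpers, using an invented toy model trained with train_word2vec above (similarities on such a tiny corpus are essentially random):

    toy = train_word2vec([['the', 'cat', 'sat'], ['the', 'dog', 'ran']], params={'size': 50, 'min_count': 1})
    print(get_most_similar(toy, pos_word='cat', topn=2))  # [(word, similarity_score), ...]
    print(get_similarity_score(toy, 'cat', 'dog'))        # A float, roughly in [-1, 1]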
 def get_sentence_wise_vector(model, docs):
+    """
+    Get the word-wise vectors (Word2Vec or GloVe) for each word in a sentence, take their average, and return a single vector representing each sentence.
+
+    :param model: Gensim Word2Vec model
+    :param docs: List of lists of words, e.g. [['here', 'there'], ['clear', 'tear']]
+
+    <Returns a list of arrays representing one vector for each sentence>
+    """
     # Function code credits: https://towardsdatascience.com/supercharging-word-vectors-be80ee5513d

     # Initialize dictionary with existing vocab
     w2v_words = {}
     for ele in list(model.wv.vocab):
         w2v_words[ele] = 0

-    sent_vectors = []; # the avg-w2v for each sentence/review is stored in this list
-    for sent in docs: # for each review/sentence
-        sent_vec = np.zeros(model.wv[list(model.wv.vocab.keys())[0]].shape) # as word vectors are of zero length
-        cnt_words =0; # num of words with a valid vector in the sentence/review
-        for word in sent: # for each word in a review/sentence
+    sent_vectors = []  # The avg-w2v for each sentence/review is stored in this list
+    for sent in docs:  # For each review/sentence
+        sent_vec = np.zeros(model.wv[list(model.wv.vocab.keys())[0]].shape)  # Zero vector with the same dimensionality as the word vectors
+        cnt_words = 0  # Number of words with a valid vector in the sentence/review
+        for word in sent:  # For each word in a review/sentence
             if word in w2v_words:
                 vec = model.wv[word]
                 sent_vec += vec
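Usage note: a minimal sketch, assuming the rest of the function (cut off by the hunk) averages sent_vec over cnt_words and appends it to sent_vectors as the docstring describes; the toy model and sentences are invented:

    toy = train_word2vec([['the', 'cat', 'sat'], ['the', 'dog', 'ran']], params={'size': 50, 'min_count': 1})
    vectors = get_sentence_wise_vector(toy, [['the', 'cat'], ['dog']])
    print(len(vectors))  # One averaged vector per sentence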