mirror of https://github.com/nikhiljsk/preprocess_nlp.git synced 2021-10-18 10:21:04 +03:00

documentation

This commit is contained in:
nikhiljsk
2020-03-12 17:54:52 +05:30
parent eacfdd7286
commit ea9bb428ea
2 changed files with 102 additions and 11 deletions

View File

@@ -240,7 +240,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
"version": "3.6.9"
}
},
"nbformat": 4,

View File

@@ -10,6 +10,13 @@ from sklearn.feature_extraction.text import TfidfVectorizer
# ------------------------ Count Vectorizer ------------------------
def custom_tokenizer(para):
"""
Custom tokenizer; assumes sentences are separated by ' . '. By default, this is called whenever CountVectorizer or TfidfVectorizer is called
:param para: A string where sentences are separated by ' . '
<Returns list of words>
"""
words = list()
for sent in para.split(' . '):
words.append(sent.split())
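# A minimal illustration (hypothetical input): for a paragraph like
# 'the cat sat . the dog ran', the split above first yields the sentences
# ['the cat sat', 'the dog ran'], and each sentence is then word-tokenized.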
@@ -17,6 +24,14 @@ def custom_tokenizer(para):
def count_vectorizer(sentences, params={}):
"""
count_vectorizer uses scikit-learn's CountVectorizer.
:param sentences: List of strings
:param params: Dictionary with parameters to be passed to CountVectorizer
<Returns CV_Object and Transformed_sentences>
"""
default_params = {'strip_accents': None,
'lowercase': True,
'preprocessor': None,
@@ -38,6 +53,14 @@ def count_vectorizer(sentences, params={}):
# ------------------------ TF-IDF ------------------------
def tfidf_vectorizer(sentences, params={}):
"""
tfidf_vectorizer uses scikit-learn's TfidfVectorizer.
:param sentences: List of strings
:param params: Dictionary with parameters to be passed to TfidfVectorizer
<Returns TFIDF_Object and Transformed_sentences>
"""
default_params = {'smooth_idf': True,
'use_idf': True,
'strip_accents': None,
@@ -59,36 +82,56 @@ def tfidf_vectorizer(sentences, params={}):
return tf, tf_trans_sent
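# A minimal usage sketch with a hypothetical two-document corpus; assumes
# scikit-learn is installed and extra options are passed through via params:
sample_docs = ['the cat sat . it purred', 'dogs bark . dogs run']
cv, cv_trans = count_vectorizer(sample_docs)
tf, tf_trans = tfidf_vectorizer(sample_docs, params={'ngram_range': (1, 2)})
# Each call returns the fitted vectorizer plus the transformed sentences.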
def top_words_tfidf(tf_obj, doc, topn=20):
"""
A function to find the top words in a given string (highest TF-IDF scores), according to the TFIDF_Object passed
:param tf_obj: TFIDF Object
:param doc: String where the top words are to be extracted
:param topn: Number of top words to be returned
<Returns a dictionary of top words; keys are words, values are TF-IDF scores>
"""
# Function code credits: https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/
# Transform the string to a vector and sort the words according to their TF-IDF scores
tf_idf_vector = tf_obj.transform(doc)
tuples = zip(tf_idf_vector.tocoo().col, tf_idf_vector.tocoo().data)
sorted_items = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
# Get the feature names or vocab
feature_names = tf_obj.get_feature_names()
# Use only topn items from vector
sorted_items = sorted_items[:topn]
score_vals = []
feature_vals = []
# Word index and corresponding tf-idf score
for idx, score in sorted_items:
# Keep track of feature name and its corresponding score
score_vals.append(round(score, 3))
feature_vals.append(feature_names[idx])
# Map each feature to its score
results = {}
for idx in range(len(feature_vals)):
results[feature_vals[idx]] = score_vals[idx]
# Top words
return results
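# A usage sketch with the tf object from the sketch above; note that
# tf_obj.transform expects an iterable of documents, so a single string
# is safest wrapped in a one-element list:
top_words = top_words_tfidf(tf, ['dogs bark and cats purr'], topn=5)
# e.g. {'dogs': 0.707, 'bark': 0.707, ...} (scores depend on the fitted corpus)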
# ------------------------ Word2Vec ------------------------
def load_word2vec(path=None):
"""
Function to load a pretrained Word2Vec model (Gensim KeyedVectors).
:param path: Path to the word2vec file
<Returns loaded Gensim Word2Vec model>
"""
try:
model = KeyedVectors.load_word2vec_format(path, binary=True)
print("Model loaded successfully!")
@@ -100,11 +143,20 @@ def load_word2vec(path=None):
def train_word2vec(documents, params={}):
"""
Function to train a Gensim Word2Vec model on a given corpus.
:param documents: List of strings
:param params: Dictionary of parameters to be passed to Gensim Word2Vec
<Returns Gensim Word2Vec model>
"""
default_params = {'size': 100,
'window': 10,
'min_count': 1,
'workers': 8}
default_params.update(params)
model = gensim.models.Word2Vec(documents, **default_params)
model.train(documents, total_examples=len(documents), epochs=50)
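# A training sketch with a hypothetical pre-tokenized corpus; Gensim's
# Word2Vec consumes lists of tokens, and 'size' matches the Gensim 3.x
# API used here (Gensim 4 renamed it to vector_size):
tokenized_docs = [['the', 'cat', 'sat'], ['dogs', 'bark']]
w2v_model = train_word2vec(tokenized_docs, params={'size': 50, 'window': 5})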
@@ -113,6 +165,14 @@ def train_word2vec(documents, params={}):
# ------------------------ GloVe ------------------------
def load_glove(path=None):
"""
Function to load a pretrained GloVe model. It converts the GloVe vectors to Gensim Word2Vec format.
:param path: Path to the GloVe file
<Returns loaded Gensim Word2Vec Model (Converted)>
"""
# Convert the GloVe to Word2Vec
try:
temp = glove2word2vec(path, path+'.word2vec')
except Exception as e:
@@ -121,31 +181,62 @@ def load_glove(path=None):
print('\n\n\nError:\t', e)
return None
# Load the pretrained model
model = KeyedVectors.load_word2vec_format(path+'.word2vec', binary=False)
print("Model loaded successfully!")
return model
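# A loading sketch with a hypothetical path; glove2word2vec writes the
# converted vectors next to the original file before KeyedVectors reads them:
glove_model = load_glove('glove.6B.100d.txt')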
# ------------------------ Word2Vec & GloVe ------------------------
def get_most_similar(model, pos_word, neg_word=None, topn=1):
"""
Given a Gensim Word2Vec model, find the top words related to a word or list of words.
:param model: Gensim Word2Vec model
:param pos_word: A single word or list of words that contribute positively to the query
:param neg_word: A single word or list of words that contribute negatively to the query
:param topn: Number of top words to be extracted
<Returns list of tuples (word, similarity_score)>
"""
return model.wv.most_similar(positive=pos_word, negative=neg_word, topn=topn)
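# For instance, the classic analogy query: words close to 'king' and
# 'woman' but far from 'man' (assumes w2v_model from the sketch above):
get_most_similar(w2v_model, pos_word=['king', 'woman'], neg_word=['man'], topn=3)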
def get_similarity_score(model, w1, w2):
"""
Given a Gensim Word2Vec model, find the similarity_score for w1 & w2
:param model: Gensim Word2Vec model
:param w1: First word
:param w2: Second word
<Returns a float value, the similarity between the two given words>
"""
return model.wv.similarity(w1, w2)
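# The score is a cosine similarity in [-1, 1]; for example:
get_similarity_score(w2v_model, 'cat', 'dog')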
def get_sentence_wise_vector(model, docs):
"""
Get the word-wise vectors (Word2Vec or GloVe) for each sentence and average them to get a single vector representing each sentence
:param model: Gensim Word2Vec model
:param docs: List of lists of words [['here', 'there'],['clear','tear']]
<Returns a list of arrays representing one vector for each sentence>
"""
# Function Code Credits: https://towardsdatascience.com/supercharging-word-vectors-be80ee5513d
# Initialize dictionary with existing vocab
w2v_words = {}
for ele in list(model.wv.vocab):
w2v_words[ele] = 0
sent_vectors = []  # The avg-w2v vector for each sentence/review is stored in this list
for sent in docs:  # For each review/sentence
sent_vec = np.zeros(model.wv[list(model.wv.vocab.keys())[0]].shape)  # Zero vector with the model's word-vector dimensionality
cnt_words = 0  # Number of words with a valid vector in the sentence/review
for word in sent:  # For each word in a review/sentence
if word in w2v_words:
vec = model.wv[word]
sent_vec += vec
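# A usage sketch (hypothetical token lists; assumes w2v_model from above);
# per the docstring, one averaged vector comes back for each sentence:
sentence_vecs = get_sentence_wise_vector(w2v_model, [['the', 'cat'], ['dogs', 'bark']])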