Mirror of https://github.com/nikhiljsk/preprocess_nlp.git (synced 2021-10-18 10:21:04 +03:00)

Commit: vectorization and requirements
requirements.txt
@@ -1,8 +1,10 @@
nltk==3.4.5
spacy==2.2.3
yake==0.3.7
numpy==1.18.1
contractions==0.0.24
nltk==3.4.5
matplotlib==3.1.3
yake==0.3.7
spacy==2.2.3
beautifulsoup4==4.8.2
ipython==7.12.0
gensim==3.8.1
ipython==7.13.0
scikit_learn==0.22.2.post1
@@ -16,178 +16,6 @@
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Library built"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import gensim\n",
"import numpy as np\n",
"from itertools import chain\n",
"from gensim.models import KeyedVectors\n",
"from gensim.scripts.glove2word2vec import glove2word2vec\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"\n",
"\n",
"# ------------------------ Count Vectorizer ------------------------\n",
"def custom_tokenizer(para):\n",
"    words = list()\n",
"    for sent in para.split(' . '):\n",
"        words.append(sent.split())\n",
"    return list(chain(*words))\n",
"\n",
"\n",
"def count_vectorizer(sentences, params={}):\n",
"    default_params = {'strip_accents': None, \n",
"                      'lowercase': True,\n",
"                      'preprocessor': None, \n",
"                      'tokenizer': None, \n",
"                      'stop_words': None, \n",
"                      'ngram_range': (1, 1), \n",
"                      'analyzer': 'word', \n",
"                      'max_df': 1.0, \n",
"                      'min_df': 1, \n",
"                      'max_features': None, \n",
"                      'vocabulary': None}\n",
"    default_params.update(params)\n",
"    \n",
"    cv = CountVectorizer(sentences, **default_params)\n",
"    cv_trans_sent = cv.fit_transform(sentences)\n",
"    \n",
"    return cv, cv_trans_sent\n",
"\n",
"\n",
"# ------------------------ TF-IDF ------------------------\n",
"def tfidf_vectorizer(sentences, params={}):\n",
"    default_params = {'smooth_idf': True,\n",
"                      'use_idf': True,\n",
"                      'strip_accents': None, \n",
"                      'lowercase': True,\n",
"                      'preprocessor': None, \n",
"                      'tokenizer': None, \n",
"                      'stop_words': None, \n",
"                      'ngram_range': (1, 1), \n",
"                      'analyzer': 'word', \n",
"                      'max_df': 1.0, \n",
"                      'min_df': 1, \n",
"                      'max_features': None, \n",
"                      'vocabulary': None}\n",
"    default_params.update(params)\n",
"    \n",
"    tf = TfidfVectorizer(**default_params)\n",
"    tf_trans_sent = tf.fit_transform(sentences)\n",
"    \n",
"    return tf, tf_trans_sent\n",
"\n",
"\n",
"def top_words_tfidf(tf_obj, doc, topn=20): \n",
"    # Function code credits: https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/\n",
"    tf_idf_vector = tf_obj.transform(doc)\n",
"    tuples = zip(tf_idf_vector.tocoo().col, tf_idf_vector.tocoo().data)\n",
"    sorted_items = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)\n",
"    \n",
"    feature_names = tf_obj.get_feature_names()\n",
"    \n",
"    #use only topn items from vector\n",
"    sorted_items = sorted_items[:topn]\n",
"    score_vals = []\n",
"    feature_vals = []\n",
"    \n",
"    # word index and corresponding tf-idf score\n",
"    for idx, score in sorted_items:\n",
"        \n",
"        #keep track of feature name and its corresponding score\n",
"        score_vals.append(round(score, 3))\n",
"        feature_vals.append(feature_names[idx])\n",
"    \n",
"    #create a tuples of feature,score\n",
"    results= {}\n",
"    for idx in range(len(feature_vals)):\n",
"        results[feature_vals[idx]]=score_vals[idx]\n",
"\n",
"    return results\n",
"\n",
"\n",
"# ------------------------ Word2Vec ------------------------\n",
"def load_word2vec(path=None):\n",
"    try:\n",
"        model = KeyedVectors.load_word2vec_format(path, binary=True)\n",
"        print(\"Model loaded successfully!\")\n",
"        return model\n",
"    except Exception as e:\n",
"        print(\"Please download the dataset from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit\")\n",
"        print(\"-----!! MODEL NOT LOADED !!-----\")\n",
"        print('\\n\\n\\nError:\\t', e)\n",
"\n",
"    \n",
"def train_word2vec(documents, params={}):\n",
"    default_params = {'size': 100,\n",
"                      'window': 10,\n",
"                      'min_count': 1, \n",
"                      'workers': 8}\n",
"    default_params.update(params)\n",
"    model = gensim.models.Word2Vec (documents, **default_params)\n",
"    model.train(documents,total_examples=len(documents),epochs=50)\n",
"    \n",
"    return model\n",
"\n",
"\n",
"# ------------------------ GloVe ------------------------\n",
"def load_glove(path=None):\n",
"    try:\n",
"        temp = glove2word2vec(path, path+'.word2vec')\n",
"    except Exception as e:\n",
"        print(\"Please download the glove.6B.zip dataset from: https://nlp.stanford.edu/projects/glove/\")\n",
"        print(\"-----!! MODEL NOT LOADED !!-----\")\n",
"        print('\\n\\n\\nError:\\t', e)\n",
"        return None\n",
"    \n",
"    model = KeyedVectors.load_word2vec_format(path+'.word2vec', binary=False)\n",
"    print(\"Model loaded successfully!\")\n",
"    return model\n",
"\n",
"\n",
"# ------------------------ Word2Vec & GloVe ------------------------\n",
"def get_most_similar(model, pos_word, neg_word=None, topn=1):\n",
"    return model.wv.most_similar(positive=pos_word, negative=neg_word, topn=topn)\n",
"\n",
"\n",
"def get_similarity_score(model, w1, w2):\n",
"    return model.wv.similarity(w1, w2)\n",
"\n",
"\n",
"def get_sentence_wise_vector(model, docs):\n",
"    # Initialize dictionary with existing vocab\n",
"    w2v_words = {}\n",
"    for ele in list(model.wv.vocab):\n",
"        w2v_words[ele] = 0\n",
"    \n",
"    sent_vectors = []; # the avg-w2v for each sentence/review is stored in this list\n",
"    for sent in docs: # for each review/sentence\n",
"        sent_vec = np.zeros(model.wv[list(model.wv.vocab.keys())[0]].shape) # as word vectors are of zero length\n",
"        cnt_words =0; # num of words with a valid vector in the sentence/review\n",
"        for word in sent: # for each word in a review/sentence\n",
"            if word in w2v_words:\n",
"                vec = model.wv[word]\n",
"                sent_vec += vec\n",
"                cnt_words += 1\n",
"        if cnt_words != 0:\n",
"            sent_vec /= cnt_words\n",
"        sent_vectors.append(sent_vec)\n",
"    \n",
"    return sent_vectors"
]
},
{
"cell_type": "markdown",
"metadata": {},
vectorization/vectorization_nlp.py (new file, 157 lines)
@@ -0,0 +1,157 @@
import gensim
import numpy as np
from itertools import chain
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer



# ------------------------ Count Vectorizer ------------------------
def custom_tokenizer(para):
    words = list()
    for sent in para.split(' . '):
        words.append(sent.split())
    return list(chain(*words))


def count_vectorizer(sentences, params={}):
    default_params = {'strip_accents': None,
                      'lowercase': True,
                      'preprocessor': None,
                      'tokenizer': None,
                      'stop_words': None,
                      'ngram_range': (1, 1),
                      'analyzer': 'word',
                      'max_df': 1.0,
                      'min_df': 1,
                      'max_features': None,
                      'vocabulary': None}
    default_params.update(params)

    cv = CountVectorizer(**default_params)  # the corpus is passed to fit_transform, not to the constructor
    cv_trans_sent = cv.fit_transform(sentences)

    return cv, cv_trans_sent


# ------------------------ TF-IDF ------------------------
def tfidf_vectorizer(sentences, params={}):
    default_params = {'smooth_idf': True,
                      'use_idf': True,
                      'strip_accents': None,
                      'lowercase': True,
                      'preprocessor': None,
                      'tokenizer': None,
                      'stop_words': None,
                      'ngram_range': (1, 1),
                      'analyzer': 'word',
                      'max_df': 1.0,
                      'min_df': 1,
                      'max_features': None,
                      'vocabulary': None}
    default_params.update(params)

    tf = TfidfVectorizer(**default_params)
    tf_trans_sent = tf.fit_transform(sentences)

    return tf, tf_trans_sent


def top_words_tfidf(tf_obj, doc, topn=20):
    # Function code credits: https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/
    tf_idf_vector = tf_obj.transform(doc)
    tuples = zip(tf_idf_vector.tocoo().col, tf_idf_vector.tocoo().data)
    sorted_items = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

    feature_names = tf_obj.get_feature_names()

    # Use only the topn items from the vector
    sorted_items = sorted_items[:topn]
    score_vals = []
    feature_vals = []

    # Word index and corresponding TF-IDF score
    for idx, score in sorted_items:
        # Keep track of the feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    # Create a dictionary of feature: score
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]

    return results


# ------------------------ Word2Vec ------------------------
def load_word2vec(path=None):
    try:
        model = KeyedVectors.load_word2vec_format(path, binary=True)
        print("Model loaded successfully!")
        return model
    except Exception as e:
        print("Please download the dataset from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit")
        print("-----!! MODEL NOT LOADED !!-----")
        print('\n\n\nError:\t', e)


def train_word2vec(documents, params={}):
    default_params = {'size': 100,
                      'window': 10,
                      'min_count': 1,
                      'workers': 8}
    default_params.update(params)
    model = gensim.models.Word2Vec(documents, **default_params)
    model.train(documents, total_examples=len(documents), epochs=50)

    return model


# ------------------------ GloVe ------------------------
def load_glove(path=None):
    try:
        temp = glove2word2vec(path, path + '.word2vec')
    except Exception as e:
        print("Please download the glove.6B.zip dataset from: https://nlp.stanford.edu/projects/glove/")
        print("-----!! MODEL NOT LOADED !!-----")
        print('\n\n\nError:\t', e)
        return None

    model = KeyedVectors.load_word2vec_format(path + '.word2vec', binary=False)
    print("Model loaded successfully!")
    return model


# ------------------------ Word2Vec & GloVe ------------------------
def get_most_similar(model, pos_word, neg_word=None, topn=1):
    return model.wv.most_similar(positive=pos_word, negative=neg_word, topn=topn)


def get_similarity_score(model, w1, w2):
    return model.wv.similarity(w1, w2)


def get_sentence_wise_vector(model, docs):
    # Initialize a dictionary with the existing vocabulary
    w2v_words = {}
    for ele in list(model.wv.vocab):
        w2v_words[ele] = 0

    sent_vectors = []  # the average w2v vector for each sentence/review is stored in this list
    for sent in docs:  # for each review/sentence
        sent_vec = np.zeros(model.wv[list(model.wv.vocab.keys())[0]].shape)  # zero vector with the word-vector dimensionality
        cnt_words = 0  # number of words with a valid vector in the sentence/review
        for word in sent:  # for each word in a review/sentence
            if word in w2v_words:
                vec = model.wv[word]
                sent_vec += vec
                cnt_words += 1
        if cnt_words != 0:
            sent_vec /= cnt_words
        sent_vectors.append(sent_vec)

    return sent_vectors
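For orientation, here is a minimal usage sketch of the functions added in vectorization_nlp.py. The toy corpus, parameter overrides, and import path are illustrative placeholders (not part of the commit), and the gensim 3.8 API (model.wv.vocab) is assumed, matching the pinned requirements; the script is assumed to run from the vectorization/ directory so the module is importable.

# Hypothetical usage sketch, not part of the committed file.
from vectorization_nlp import (count_vectorizer, tfidf_vectorizer, top_words_tfidf,
                               train_word2vec, get_most_similar, get_sentence_wise_vector)

# Made-up toy corpus; any list of raw strings works.
corpus = ["the cat sat on the mat",
          "the dog chased the cat",
          "cats and dogs make good pets"]

# Bag-of-words counts; `params` overrides the CountVectorizer defaults.
cv, count_matrix = count_vectorizer(corpus, params={'ngram_range': (1, 2)})

# TF-IDF weights, plus the top-scoring terms of the first document.
tf, tfidf_matrix = tfidf_vectorizer(corpus)
print(top_words_tfidf(tf, [corpus[0]], topn=5))

# Word2Vec expects pre-tokenized documents (lists of tokens).
tokenized = [doc.split() for doc in corpus]
w2v = train_word2vec(tokenized, params={'size': 50, 'min_count': 1})
print(get_most_similar(w2v, pos_word=['cat'], topn=3))

# Average word vector per sentence, usable as a fixed-size sentence embedding.
sentence_vectors = get_sentence_wise_vector(w2v, tokenized)

The `params` dicts are forwarded unchanged to the scikit-learn and gensim constructors, so any constructor keyword those libraries accept can be overridden the same way.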