mirror of https://github.com/nikhiljsk/preprocess_nlp.git synced 2021-10-18 10:21:04 +03:00

vectorization and requirements

This commit is contained in:
nikhiljsk
2020-03-12 15:10:29 +05:30
parent 64ecd2d7d3
commit b07f3fe12a
3 changed files with 163 additions and 176 deletions

View File

@@ -1,8 +1,10 @@
nltk==3.4.5
spacy==2.2.3
yake==0.3.7
numpy==1.18.1
contractions==0.0.24
nltk==3.4.5
matplotlib==3.1.3
yake==0.3.7
spacy==2.2.3
beautifulsoup4==4.8.2
ipython==7.12.0
gensim==3.8.1
ipython==7.13.0
scikit_learn==0.22.2.post1

View File

@@ -16,178 +16,6 @@
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Library built"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import gensim\n",
"import numpy as np\n",
"from itertools import chain\n",
"from gensim.models import KeyedVectors\n",
"from gensim.scripts.glove2word2vec import glove2word2vec\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"\n",
"\n",
"# ------------------------ Count Vectorizer ------------------------\n",
"def custom_tokenizer(para):\n",
" words = list()\n",
" for sent in para.split(' . '):\n",
" words.append(sent.split())\n",
" return list(chain(*words))\n",
"\n",
"\n",
"def count_vectorizer(sentences, params={}):\n",
" default_params = {'strip_accents': None, \n",
" 'lowercase': True,\n",
" 'preprocessor': None, \n",
" 'tokenizer': None, \n",
" 'stop_words': None, \n",
" 'ngram_range': (1, 1), \n",
" 'analyzer': 'word', \n",
" 'max_df': 1.0, \n",
" 'min_df': 1, \n",
" 'max_features': None, \n",
" 'vocabulary': None}\n",
" default_params.update(params)\n",
" \n",
" cv = CountVectorizer(sentences, **default_params)\n",
" cv_trans_sent = cv.fit_transform(sentences)\n",
" \n",
" return cv, cv_trans_sent\n",
"\n",
"\n",
"# ------------------------ TF-IDF ------------------------\n",
"def tfidf_vectorizer(sentences, params={}):\n",
" default_params = {'smooth_idf': True,\n",
" 'use_idf': True,\n",
" 'strip_accents': None, \n",
" 'lowercase': True,\n",
" 'preprocessor': None, \n",
" 'tokenizer': None, \n",
" 'stop_words': None, \n",
" 'ngram_range': (1, 1), \n",
" 'analyzer': 'word', \n",
" 'max_df': 1.0, \n",
" 'min_df': 1, \n",
" 'max_features': None, \n",
" 'vocabulary': None}\n",
" default_params.update(params)\n",
" \n",
" tf = TfidfVectorizer(**default_params)\n",
" tf_trans_sent = tf.fit_transform(sentences)\n",
" \n",
" return tf, tf_trans_sent\n",
"\n",
"\n",
"def top_words_tfidf(tf_obj, doc, topn=20): \n",
" # Function code credits: https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/\n",
" tf_idf_vector = tf_obj.transform(doc)\n",
" tuples = zip(tf_idf_vector.tocoo().col, tf_idf_vector.tocoo().data)\n",
" sorted_items = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)\n",
" \n",
" feature_names = tf_obj.get_feature_names()\n",
" \n",
" #use only topn items from vector\n",
" sorted_items = sorted_items[:topn]\n",
" score_vals = []\n",
" feature_vals = []\n",
" \n",
" # word index and corresponding tf-idf score\n",
" for idx, score in sorted_items:\n",
" \n",
" #keep track of feature name and its corresponding score\n",
" score_vals.append(round(score, 3))\n",
" feature_vals.append(feature_names[idx])\n",
" \n",
" #create a tuples of feature,score\n",
" results= {}\n",
" for idx in range(len(feature_vals)):\n",
" results[feature_vals[idx]]=score_vals[idx]\n",
"\n",
" return results\n",
"\n",
"\n",
"# ------------------------ Word2Vec ------------------------\n",
"def load_word2vec(path=None):\n",
" try:\n",
" model = KeyedVectors.load_word2vec_format(path, binary=True)\n",
" print(\"Model loaded successfully!\")\n",
" return model\n",
" except Exception as e:\n",
" print(\"Please download the dataset from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit\")\n",
" print(\"-----!! MODEL NOT LOADED !!-----\")\n",
" print('\\n\\n\\nError:\\t', e)\n",
"\n",
" \n",
"def train_word2vec(documents, params={}):\n",
" default_params = {'size': 100,\n",
" 'window': 10,\n",
" 'min_count': 1, \n",
" 'workers': 8}\n",
" default_params.update(params)\n",
" model = gensim.models.Word2Vec (documents, **default_params)\n",
" model.train(documents,total_examples=len(documents),epochs=50)\n",
" \n",
" return model\n",
"\n",
"\n",
"# ------------------------ GloVe ------------------------\n",
"def load_glove(path=None):\n",
" try:\n",
" temp = glove2word2vec(path, path+'.word2vec')\n",
" except Exception as e:\n",
" print(\"Please download the glove.6B.zip dataset from: https://nlp.stanford.edu/projects/glove/\")\n",
" print(\"-----!! MODEL NOT LOADED !!-----\")\n",
" print('\\n\\n\\nError:\\t', e)\n",
" return None\n",
" \n",
" model = KeyedVectors.load_word2vec_format(path+'.word2vec', binary=False)\n",
" print(\"Model loaded successfully!\")\n",
" return model\n",
"\n",
"\n",
"# ------------------------ Word2Vec & GloVe ------------------------\n",
"def get_most_similar(model, pos_word, neg_word=None, topn=1):\n",
" return model.wv.most_similar(positive=pos_word, negative=neg_word, topn=topn)\n",
"\n",
"\n",
"def get_similarity_score(model, w1, w2):\n",
" return model.wv.similarity(w1, w2)\n",
"\n",
"\n",
"def get_sentence_wise_vector(model, docs):\n",
" # Initialize dictionary with existing vocab\n",
" w2v_words = {}\n",
" for ele in list(model.wv.vocab):\n",
" w2v_words[ele] = 0\n",
" \n",
" sent_vectors = []; # the avg-w2v for each sentence/review is stored in this list\n",
" for sent in docs: # for each review/sentence\n",
" sent_vec = np.zeros(model.wv[list(model.wv.vocab.keys())[0]].shape) # as word vectors are of zero length\n",
" cnt_words =0; # num of words with a valid vector in the sentence/review\n",
" for word in sent: # for each word in a review/sentence\n",
" if word in w2v_words:\n",
" vec = model.wv[word]\n",
" sent_vec += vec\n",
" cnt_words += 1\n",
" if cnt_words != 0:\n",
" sent_vec /= cnt_words\n",
" sent_vectors.append(sent_vec)\n",
" \n",
" return sent_vectors"
]
},
{
"cell_type": "markdown",
"metadata": {},

View File

@@ -0,0 +1,157 @@
import gensim
import numpy as np
from itertools import chain
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# ------------------------ Count Vectorizer ------------------------
def custom_tokenizer(para):
words = list()
for sent in para.split(' . '):
words.append(sent.split())
return list(chain(*words))
def count_vectorizer(sentences, params={}):
default_params = {'strip_accents': None,
'lowercase': True,
'preprocessor': None,
'tokenizer': None,
'stop_words': None,
'ngram_range': (1, 1),
'analyzer': 'word',
'max_df': 1.0,
'min_df': 1,
'max_features': None,
'vocabulary': None}
default_params.update(params)
    cv = CountVectorizer(**default_params)
cv_trans_sent = cv.fit_transform(sentences)
return cv, cv_trans_sent
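# Illustrative usage (a sketch; the sample sentences below are made up and not part of the repo):
#   sentences = ["the cat sat on the mat", "the dog sat on the log"]
#   cv, counts = count_vectorizer(sentences, params={'ngram_range': (1, 2)})
#   # counts is a sparse document-term matrix; cv.get_feature_names() returns the learned vocabulary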
# ------------------------ TF-IDF ------------------------
def tfidf_vectorizer(sentences, params={}):
default_params = {'smooth_idf': True,
'use_idf': True,
'strip_accents': None,
'lowercase': True,
'preprocessor': None,
'tokenizer': None,
'stop_words': None,
'ngram_range': (1, 1),
'analyzer': 'word',
'max_df': 1.0,
'min_df': 1,
'max_features': None,
'vocabulary': None}
default_params.update(params)
tf = TfidfVectorizer(**default_params)
tf_trans_sent = tf.fit_transform(sentences)
return tf, tf_trans_sent
def top_words_tfidf(tf_obj, doc, topn=20):
# Function code credits: https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/
tf_idf_vector = tf_obj.transform(doc)
tuples = zip(tf_idf_vector.tocoo().col, tf_idf_vector.tocoo().data)
sorted_items = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
feature_names = tf_obj.get_feature_names()
#use only topn items from vector
sorted_items = sorted_items[:topn]
score_vals = []
feature_vals = []
# word index and corresponding tf-idf score
for idx, score in sorted_items:
#keep track of feature name and its corresponding score
score_vals.append(round(score, 3))
feature_vals.append(feature_names[idx])
    # map each feature name to its rounded tf-idf score
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]
return results
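# Illustrative usage (a sketch; assumes `sentences` is a list of raw text documents as above):
#   tf, tfidf_matrix = tfidf_vectorizer(sentences, params={'stop_words': 'english'})
#   top_words_tfidf(tf, ["a new document to score"], topn=10)
#   # returns a dict mapping the highest-scoring terms of the new document to rounded tf-idf scores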
# ------------------------ Word2Vec ------------------------
def load_word2vec(path=None):
try:
model = KeyedVectors.load_word2vec_format(path, binary=True)
print("Model loaded successfully!")
return model
except Exception as e:
print("Please download the dataset from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit")
print("-----!! MODEL NOT LOADED !!-----")
print('\n\n\nError:\t', e)
def train_word2vec(documents, params={}):
default_params = {'size': 100,
'window': 10,
'min_count': 1,
'workers': 8}
default_params.update(params)
    model = gensim.models.Word2Vec(documents, **default_params)
    model.train(documents, total_examples=len(documents), epochs=50)
return model
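# Illustrative usage (a sketch; word2vec expects tokenised documents, i.e. lists of word lists):
#   tokenised_docs = [sent.split() for sent in sentences]
#   w2v = train_word2vec(tokenised_docs, params={'size': 50, 'window': 5})
#   # or load the pretrained GoogleNews vectors (assumed local filename):
#   # w2v = load_word2vec('GoogleNews-vectors-negative300.bin')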
# ------------------------ GloVe ------------------------
def load_glove(path=None):
try:
temp = glove2word2vec(path, path+'.word2vec')
except Exception as e:
print("Please download the glove.6B.zip dataset from: https://nlp.stanford.edu/projects/glove/")
print("-----!! MODEL NOT LOADED !!-----")
print('\n\n\nError:\t', e)
return None
model = KeyedVectors.load_word2vec_format(path+'.word2vec', binary=False)
print("Model loaded successfully!")
return model
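# Illustrative usage (hypothetical local path to the unzipped GloVe file):
#   glove = load_glove('glove.6B.100d.txt')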
# ------------------------ Word2Vec & GloVe ------------------------
def get_most_similar(model, pos_word, neg_word=None, topn=1):
return model.wv.most_similar(positive=pos_word, negative=neg_word, topn=topn)
def get_similarity_score(model, w1, w2):
return model.wv.similarity(w1, w2)
def get_sentence_wise_vector(model, docs):
# Initialize dictionary with existing vocab
w2v_words = {}
for ele in list(model.wv.vocab):
w2v_words[ele] = 0
    sent_vectors = []  # the average word2vec vector for each sentence/review is stored in this list
    for sent in docs:  # for each review/sentence
        sent_vec = np.zeros(model.wv.vector_size)  # start with a zero vector of the embedding dimensionality
        cnt_words = 0  # num of words with a valid vector in the sentence/review
for word in sent: # for each word in a review/sentence
if word in w2v_words:
vec = model.wv[word]
sent_vec += vec
cnt_words += 1
if cnt_words != 0:
sent_vec /= cnt_words
sent_vectors.append(sent_vec)
return sent_vectors
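# ------------------------ Usage sketch ------------------------
# A minimal end-to-end sketch of the helpers above, guarded so importing the module has no
# side effects. The toy corpus is made up purely for illustration; a pretrained model returned
# by load_word2vec/load_glove could be passed in place of the freshly trained one.
if __name__ == '__main__':
    corpus = ["the cat sat on the mat", "the dog chased the cat"]
    tokenised = [sent.split() for sent in corpus]

    # Bag-of-words and tf-idf representations
    cv, counts = count_vectorizer(corpus)
    tf, tfidf_matrix = tfidf_vectorizer(corpus)
    print(top_words_tfidf(tf, ["the dog chased the mat"], topn=5))

    # Train a small word2vec model and query it
    w2v = train_word2vec(tokenised, params={'size': 50, 'window': 2})
    print(get_most_similar(w2v, ['cat'], topn=2))
    print(get_similarity_score(w2v, 'cat', 'dog'))

    # One averaged word vector per sentence
    print(get_sentence_wise_vector(w2v, tokenised)[0].shape)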