mirror of
https://github.com/kavgan/nlp-in-practice.git
synced 2021-05-12 18:32:22 +03:00
add example for batch keyword extraction
This commit is contained in:
@@ -34,25 +34,25 @@
|
||||
"text": [
|
||||
"Schema:\n",
|
||||
"\n",
|
||||
" accepted_answer_id float64\n",
|
||||
"answer_count int64\n",
|
||||
" id int64\n",
|
||||
"title object\n",
|
||||
"body object\n",
|
||||
"answer_count int64\n",
|
||||
"comment_count int64\n",
|
||||
"community_owned_date object\n",
|
||||
"creation_date object\n",
|
||||
"favorite_count float64\n",
|
||||
"id int64\n",
|
||||
"last_activity_date object\n",
|
||||
"last_edit_date object\n",
|
||||
"last_editor_display_name object\n",
|
||||
"last_editor_user_id float64\n",
|
||||
"owner_display_name object\n",
|
||||
"owner_user_id float64\n",
|
||||
"post_type_id int64\n",
|
||||
"score int64\n",
|
||||
"tags object\n",
|
||||
"title object\n",
|
||||
"view_count int64\n",
|
||||
"accepted_answer_id float64\n",
|
||||
"favorite_count float64\n",
|
||||
"last_edit_date object\n",
|
||||
"last_editor_user_id float64\n",
|
||||
"community_owned_date object\n",
|
||||
"dtype: object\n",
|
||||
"Number of questions,columns= (20000, 19)\n"
|
||||
]
|
||||
@@ -460,14 +460,6 @@
|
||||
"jsf 0.152\n",
|
||||
"possibility 0.146\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/feature_extraction/text.py:1089: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
|
||||
" if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -605,6 +597,162 @@
|
||||
"print_results(idx,keywords)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Generate keywords for a batch of documents"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>doc</th>\n",
|
||||
" <th>keywords</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>serializing a private struct can it be done i ...</td>\n",
|
||||
" <td>{'eclipse': 0.593, 'war': 0.317, 'integrate': ...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>how do i prevent floated right content from ov...</td>\n",
|
||||
" <td>{'evaluate': 0.472, 'content': 0.403, 'console...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>gradle command line i m trying to run a shell ...</td>\n",
|
||||
" <td>{'appdomain': 0.409, 'dynamic': 0.384, 'perfor...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>loop variable as parameter in asynchronous fun...</td>\n",
|
||||
" <td>{'image': 0.424, 'jpg': 0.412, 'background': 0...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>canot get the href value hi i need to valid th...</td>\n",
|
||||
" <td>{'uri': 0.371, 'bitmap': 0.318, 'intent': 0.30...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>495</th>\n",
|
||||
" <td>how to unbind click and click submit button in...</td>\n",
|
||||
" <td>{'delphi': 0.617, 'compatible': 0.365, 'win': ...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>496</th>\n",
|
||||
" <td>swaggerui auth redirect swaggeruiauth of null ...</td>\n",
|
||||
" <td>{'node': 0.547, 'selectsinglenode': 0.304, 'nu...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>497</th>\n",
|
||||
" <td>ssrs value display error for ssrs conditional ...</td>\n",
|
||||
" <td>{'logo': 0.549, 'step': 0.33, 'triangle': 0.32...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>498</th>\n",
|
||||
" <td>accessing and changing a class instance from a...</td>\n",
|
||||
" <td>{'length': 0.426, 'ev': 0.415, 'introduce': 0....</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>499</th>\n",
|
||||
" <td>how to print the current time in the format da...</td>\n",
|
||||
" <td>{'oauth': 0.388, 'localhost': 0.383, 'sdk': 0....</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>500 rows × 2 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" doc \\\n",
|
||||
"0 serializing a private struct can it be done i ... \n",
|
||||
"1 how do i prevent floated right content from ov... \n",
|
||||
"2 gradle command line i m trying to run a shell ... \n",
|
||||
"3 loop variable as parameter in asynchronous fun... \n",
|
||||
"4 canot get the href value hi i need to valid th... \n",
|
||||
".. ... \n",
|
||||
"495 how to unbind click and click submit button in... \n",
|
||||
"496 swaggerui auth redirect swaggeruiauth of null ... \n",
|
||||
"497 ssrs value display error for ssrs conditional ... \n",
|
||||
"498 accessing and changing a class instance from a... \n",
|
||||
"499 how to print the current time in the format da... \n",
|
||||
"\n",
|
||||
" keywords \n",
|
||||
"0 {'eclipse': 0.593, 'war': 0.317, 'integrate': ... \n",
|
||||
"1 {'evaluate': 0.472, 'content': 0.403, 'console... \n",
|
||||
"2 {'appdomain': 0.409, 'dynamic': 0.384, 'perfor... \n",
|
||||
"3 {'image': 0.424, 'jpg': 0.412, 'background': 0... \n",
|
||||
"4 {'uri': 0.371, 'bitmap': 0.318, 'intent': 0.30... \n",
|
||||
".. ... \n",
|
||||
"495 {'delphi': 0.617, 'compatible': 0.365, 'win': ... \n",
|
||||
"496 {'node': 0.547, 'selectsinglenode': 0.304, 'nu... \n",
|
||||
"497 {'logo': 0.549, 'step': 0.33, 'triangle': 0.32... \n",
|
||||
"498 {'length': 0.426, 'ev': 0.415, 'introduce': 0.... \n",
|
||||
"499 {'oauth': 0.388, 'localhost': 0.383, 'sdk': 0.... \n",
|
||||
"\n",
|
||||
"[500 rows x 2 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Generate tf-idf scores for every document in docs_test (500 documents).\n",
|
||||
"tf_idf_vector=tfidf_transformer.transform(cv.transform(docs_test))\n",
|
||||
"\n",
|
||||
"results=[]\n",
|
||||
"for i in range(tf_idf_vector.shape[0]):\n",
|
||||
"\n",
|
||||
"    # get the tf-idf vector for a single document\n",
|
||||
"    curr_vector=tf_idf_vector[i]\n",
|
||||
"\n",
|
||||
"    # sort the tf-idf vector by descending order of scores\n",
|
||||
"    sorted_items=sort_coo(curr_vector.tocoo())\n",
|
||||
"\n",
|
||||
"    # extract only the top n; n here is 10\n",
|
||||
"    keywords=extract_topn_from_vector(feature_names,sorted_items,10)\n",
|
||||
"\n",
|
||||
"    results.append(keywords)\n",
|
||||
"\n",
|
||||
"# Pair every document with its own keywords: row i of tf_idf_vector was\n",
|
||||
"# computed from docs_test[i], so the 'doc' column must be built from\n",
|
||||
"# docs_test (not from the separate `docs` list).\n",
|
||||
"df=pd.DataFrame(list(zip(docs_test,results)),columns=['doc','keywords'])\n",
|
||||
"df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -630,7 +778,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.2"
|
||||
"version": "3.6.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
Reference in New Issue
Block a user