add example for batch keyword extraction

This commit is contained in:
kavgan
2020-02-12 10:37:12 -07:00
parent af89000adb
commit c61b9a97d6

View File

@@ -34,25 +34,25 @@
"text": [
"Schema:\n",
"\n",
" accepted_answer_id float64\n",
"answer_count int64\n",
" id int64\n",
"title object\n",
"body object\n",
"answer_count int64\n",
"comment_count int64\n",
"community_owned_date object\n",
"creation_date object\n",
"favorite_count float64\n",
"id int64\n",
"last_activity_date object\n",
"last_edit_date object\n",
"last_editor_display_name object\n",
"last_editor_user_id float64\n",
"owner_display_name object\n",
"owner_user_id float64\n",
"post_type_id int64\n",
"score int64\n",
"tags object\n",
"title object\n",
"view_count int64\n",
"accepted_answer_id float64\n",
"favorite_count float64\n",
"last_edit_date object\n",
"last_editor_user_id float64\n",
"community_owned_date object\n",
"dtype: object\n",
"Number of questions,columns= (20000, 19)\n"
]
@@ -460,14 +460,6 @@
"jsf 0.152\n",
"possibility 0.146\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/feature_extraction/text.py:1089: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
" if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):\n"
]
}
],
"source": [
@@ -605,6 +597,162 @@
"print_results(idx,keywords)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate keywords for a batch of documents"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>doc</th>\n",
" <th>keywords</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>serializing a private struct can it be done i ...</td>\n",
" <td>{'eclipse': 0.593, 'war': 0.317, 'integrate': ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>how do i prevent floated right content from ov...</td>\n",
" <td>{'evaluate': 0.472, 'content': 0.403, 'console...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>gradle command line i m trying to run a shell ...</td>\n",
" <td>{'appdomain': 0.409, 'dynamic': 0.384, 'perfor...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>loop variable as parameter in asynchronous fun...</td>\n",
" <td>{'image': 0.424, 'jpg': 0.412, 'background': 0...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>canot get the href value hi i need to valid th...</td>\n",
" <td>{'uri': 0.371, 'bitmap': 0.318, 'intent': 0.30...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>495</th>\n",
" <td>how to unbind click and click submit button in...</td>\n",
" <td>{'delphi': 0.617, 'compatible': 0.365, 'win': ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>496</th>\n",
" <td>swaggerui auth redirect swaggeruiauth of null ...</td>\n",
" <td>{'node': 0.547, 'selectsinglenode': 0.304, 'nu...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>497</th>\n",
" <td>ssrs value display error for ssrs conditional ...</td>\n",
" <td>{'logo': 0.549, 'step': 0.33, 'triangle': 0.32...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>498</th>\n",
" <td>accessing and changing a class instance from a...</td>\n",
" <td>{'length': 0.426, 'ev': 0.415, 'introduce': 0....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>499</th>\n",
" <td>how to print the current time in the format da...</td>\n",
" <td>{'oauth': 0.388, 'localhost': 0.383, 'sdk': 0....</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>500 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" doc \\\n",
"0 serializing a private struct can it be done i ... \n",
"1 how do i prevent floated right content from ov... \n",
"2 gradle command line i m trying to run a shell ... \n",
"3 loop variable as parameter in asynchronous fun... \n",
"4 canot get the href value hi i need to valid th... \n",
".. ... \n",
"495 how to unbind click and click submit button in... \n",
"496 swaggerui auth redirect swaggeruiauth of null ... \n",
"497 ssrs value display error for ssrs conditional ... \n",
"498 accessing and changing a class instance from a... \n",
"499 how to print the current time in the format da... \n",
"\n",
" keywords \n",
"0 {'eclipse': 0.593, 'war': 0.317, 'integrate': ... \n",
"1 {'evaluate': 0.472, 'content': 0.403, 'console... \n",
"2 {'appdomain': 0.409, 'dynamic': 0.384, 'perfor... \n",
"3 {'image': 0.424, 'jpg': 0.412, 'background': 0... \n",
"4 {'uri': 0.371, 'bitmap': 0.318, 'intent': 0.30... \n",
".. ... \n",
"495 {'delphi': 0.617, 'compatible': 0.365, 'win': ... \n",
"496 {'node': 0.547, 'selectsinglenode': 0.304, 'nu... \n",
"497 {'logo': 0.549, 'step': 0.33, 'triangle': 0.32... \n",
"498 {'length': 0.426, 'ev': 0.415, 'introduce': 0.... \n",
"499 {'oauth': 0.388, 'localhost': 0.383, 'sdk': 0.... \n",
"\n",
"[500 rows x 2 columns]"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Generate tf-idf vectors for every document in docs_test (500 documents here).\n",
"tf_idf_vector=tfidf_transformer.transform(cv.transform(docs_test))\n",
"\n",
"results=[]\n",
"for i in range(tf_idf_vector.shape[0]):\n",
"\n",
"    # get the tf-idf vector for a single document (row i of the matrix)\n",
"    curr_vector=tf_idf_vector[i]\n",
"\n",
"    # sort the tf-idf vector by descending order of scores\n",
"    sorted_items=sort_coo(curr_vector.tocoo())\n",
"\n",
"    # extract only the top n; n here is 10\n",
"    keywords=extract_topn_from_vector(feature_names,sorted_items,10)\n",
"\n",
"    results.append(keywords)\n",
"\n",
"# Pair each keyword dict with the document it was computed from.\n",
"# results[i] comes from docs_test[i], so the 'doc' column must be docs_test;\n",
"# zipping with any other document list shows keywords next to unrelated text.\n",
"df=pd.DataFrame(list(zip(docs_test,results)),columns=['doc','keywords'])\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -630,7 +778,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
"version": "3.6.5"
}
},
"nbformat": 4,