Created using Colaboratory

This commit is contained in:
George Mihaila
2019-03-08 23:56:44 -06:00
parent 9c77c0df71
commit 89259e85a5

166
arxiv_scrape.ipynb Normal file
View File

@@ -0,0 +1,166 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "arxiv_scrape.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/gmihaila/machine_learning_toolbox/blob/master/arxiv_scrape.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"metadata": {
"id": "1qRhlyCQttoC",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"## Scrape Arxiv \n",
"\n",
"Main: https://github.com/Mahdisadjadi/arxivscraper\n",
"\n",
"\n",
"https://arxiv.org/help/bulk_data_s3\n",
"\n",
"https://github.com/titipata/arxivpy "
]
},
{
"metadata": {
"id": "Qifv_Iidtqy1",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "3d9a44e8-e9a7-40f0-ee3a-35806ad3eba2"
},
"cell_type": "code",
"source": [
"# used to clean console\n",
"from IPython.display import clear_output\n",
"\n",
"!git clone https://github.com/Mahdisadjadi/arxivscraper.git\n",
"clear_output()\n",
"\n",
"# copy library to where python packages are\n",
"!python arxivscraper/setup.py install\n",
"!cp -r arxivscraper/arxivscraper /usr/local/lib/python3.6/dist-packages/.\n",
"clear_output()\n",
"\n",
"!rm -rf arxivscraper dist arxivscraper.egg-info build\n",
"\n",
"!ls"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"sample_data\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "eKnKpmw-uEBA",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"outputId": "0776f947-cef5-461e-d385-76eb339d0924"
},
"cell_type": "code",
"source": [
"import arxivscraper\n",
"import pandas as pd\n",
"import gc\n",
"\n",
"''' CHANGE YEAR!! '''\n",
"\n",
"\n",
"cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')\n",
"\n",
"def scrape(category_name, category_code, column):\n",
" scraper = arxivscraper.Scraper(category=category_code, date_from='2019-02-01',date_until='2019-02-01')\n",
" output = scraper.scrape()\n",
" df = pd.DataFrame(output,columns=cols)\n",
" df.to_csv('%s.csv'%category_name)\n",
" del scraper, df\n",
" gc.collect()\n",
" \n",
" return output\n",
"\n",
"\n",
"paper_domains = {'Astrophysics':'physics:astro-ph', \n",
" 'Nuclear_Experiment':'physics:nucl-ex', \n",
" 'Mathematics':'math', \n",
" 'Computer_Science':'cs', \n",
" 'Condensed_Matter':'physics:cond-mat', \n",
" 'Mathematical_Physics':'physics:math-ph', \n",
" 'Quantum_Physics':'physics:quant-ph', \n",
" 'Physics':'physics', \n",
" 'General_Relativity_and_Quantum_Cosmology':'physics:gr-qc', \n",
" 'Statistics':'stat', \n",
" 'Quantitative_Biology':'q-bio', \n",
" 'Nonlinear_Sciences':'physics:nlin', \n",
" 'Nuclear_Theory':'physics:nucl-th', \n",
" 'Quantitative_Finance':'q-fin', \n",
" 'High_Energy_Physics_Experiment':'physics:hep-ex', \n",
" 'High_Energy_Physics_Lattice':'physics:hep-lat', \n",
" 'High_Energy_Physics_Phenomenology':'physics:hep-ph', \n",
" 'High_Energy_Physics_Theory':'physics:hep-th'}\n",
"\n",
"all_output = []\n",
"\n",
"for category, code in paper_domains.items():\n",
" output = scrape(category, code, cols)\n",
" all_output += output\n",
" del output\n",
" gc.collect()\n",
" print(category)\n",
" print('So far: >>',len(all_output))\n",
" \n",
" break\n",
"\n",
"!ls"
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": [
"http://export.arxiv.org/oai2?verb=ListRecords&from=2019-02-01&until=2019-02-01&metadataPrefix=arXiv&set=physics:astro-ph\n",
"fetching up to 1000 records...\n",
"fetching is completed in 2.8 seconds.\n",
"Total number of records 65\n",
"Astrophysics\n",
"So far: >> 65\n",
"Astrophysics.csv sample_data\n"
],
"name": "stdout"
}
]
}
]
}