mirror of
https://github.com/gmihaila/ml_things.git
synced 2021-10-04 01:29:04 +03:00
Created using Colaboratory
This commit is contained in:
166
arxiv_scrape.ipynb
Normal file
166
arxiv_scrape.ipynb
Normal file
@@ -0,0 +1,166 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"name": "arxiv_scrape.ipynb",
|
||||
"version": "0.3.2",
|
||||
"provenance": [],
|
||||
"collapsed_sections": [],
|
||||
"include_colab_link": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "view-in-github",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"<a href=\"https://colab.research.google.com/github/gmihaila/machine_learning_toolbox/blob/master/arxiv_scrape.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"id": "1qRhlyCQttoC",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## Scrape Arxiv \n",
|
||||
"\n",
|
||||
"Main: https://github.com/Mahdisadjadi/arxivscraper\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"https://arxiv.org/help/bulk_data_s3\n",
|
||||
"\n",
|
||||
"https://github.com/titipata/arxivpy "
|
||||
]
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"id": "Qifv_Iidtqy1",
|
||||
"colab_type": "code",
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 34
|
||||
},
|
||||
"outputId": "3d9a44e8-e9a7-40f0-ee3a-35806ad3eba2"
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# used to clean console\n",
|
||||
"from IPython.display import clear_output\n",
|
||||
"\n",
|
||||
"!git clone https://github.com/Mahdisadjadi/arxivscraper.git\n",
|
||||
"clear_output()\n",
|
||||
"\n",
|
||||
"# copy library to where python packages are\n",
|
||||
"!python arxivscraper/setup.py install\n",
|
||||
"!cp -r arxivscraper/arxivscraper /usr/local/lib/python3.6/dist-packages/.\n",
|
||||
"clear_output()\n",
|
||||
"\n",
|
||||
"!rm -rf arxivscraper dist arxivscraper.egg-info build\n",
|
||||
"\n",
|
||||
"!ls"
|
||||
],
|
||||
"execution_count": 1,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"sample_data\n"
|
||||
],
|
||||
"name": "stdout"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"id": "eKnKpmw-uEBA",
|
||||
"colab_type": "code",
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 136
|
||||
},
|
||||
"outputId": "0776f947-cef5-461e-d385-76eb339d0924"
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"import arxivscraper\n",
|
||||
"import pandas as pd\n",
|
||||
"import gc\n",
|
||||
"\n",
|
||||
"''' CHANGE YEAR!! '''\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')\n",
|
||||
"\n",
|
||||
"def scrape(category_name, category_code, column):\n",
|
||||
" scraper = arxivscraper.Scraper(category=category_code, date_from='2019-02-01',date_until='2019-02-01')\n",
|
||||
" output = scraper.scrape()\n",
|
||||
" df = pd.DataFrame(output,columns=cols)\n",
|
||||
" df.to_csv('%s.csv'%category_name)\n",
|
||||
" del scraper, df\n",
|
||||
" gc.collect()\n",
|
||||
" \n",
|
||||
" return output\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"paper_domains = {'Astrophysics':'physics:astro-ph', \n",
|
||||
" 'Nuclear_Experiment':'physics:nucl-ex', \n",
|
||||
" 'Mathematics':'math', \n",
|
||||
" 'Computer_Science':'cs', \n",
|
||||
" 'Condensed_Matter':'physics:cond-mat', \n",
|
||||
" 'Mathematical_Physics':'physics:math-ph', \n",
|
||||
" 'Quantum_Physics':'physics:quant-ph', \n",
|
||||
" 'Physics':'physics', \n",
|
||||
" 'General_Relativity_and_Quantum_Cosmology':'physics:gr-qc', \n",
|
||||
" 'Statistics':'stat', \n",
|
||||
" 'Quantitative_Biology':'q-bio', \n",
|
||||
" 'Nonlinear_Sciences':'physics:nlin', \n",
|
||||
" 'Nuclear_Theory':'physics:nucl-th', \n",
|
||||
" 'Quantitative_Finance':'q-fin', \n",
|
||||
" 'High_Energy_Physics_Experiment':'physics:hep-ex', \n",
|
||||
" 'High_Energy_Physics_Lattice':'physics:hep-lat', \n",
|
||||
" 'High_Energy_Physics_Phenomenology':'physics:hep-ph', \n",
|
||||
" 'High_Energy_Physics_Theory':'physics:hep-th'}\n",
|
||||
"\n",
|
||||
"all_output = []\n",
|
||||
"\n",
|
||||
"for category, code in paper_domains.items():\n",
|
||||
" output = scrape(category, code, cols)\n",
|
||||
" all_output += output\n",
|
||||
" del output\n",
|
||||
" gc.collect()\n",
|
||||
" print(category)\n",
|
||||
" print('So far: >>',len(all_output))\n",
|
||||
" \n",
|
||||
" break\n",
|
||||
"\n",
|
||||
"!ls"
|
||||
],
|
||||
"execution_count": 2,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"http://export.arxiv.org/oai2?verb=ListRecords&from=2019-02-01&until=2019-02-01&metadataPrefix=arXiv&set=physics:astro-ph\n",
|
||||
"fetching up to 1000 records...\n",
|
||||
"fetching is completed in 2.8 seconds.\n",
|
||||
"Total number of records 65\n",
|
||||
"Astrophysics\n",
|
||||
"So far: >> 65\n",
|
||||
"Astrophysics.csv sample_data\n"
|
||||
],
|
||||
"name": "stdout"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user