1
0
mirror of https://github.com/fhamborg/news-please.git synced 2021-09-27 00:44:24 +03:00
Files
news-please-crawler/newsplease/crawler/items.py
2016-11-09 18:33:45 +01:00

43 lines
1.4 KiB
Python

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class NewscrawlerItem(scrapy.Item):
    """Scrapy item declaring the fields stored for one crawled article.

    Every attribute is a plain ``scrapy.Field()`` with no metadata; the
    comments below describe what each field is meant to carry.
    """

    # --- Storage bookkeeping ---
    # ID of the article in the DB
    db_id = scrapy.Field()
    # Path of the file on the local filesystem
    local_path = scrapy.Field()
    # Absolute path of the file on the local filesystem
    abs_local_path = scrapy.Field()
    # When the article was last modified in the DB
    modified_date = scrapy.Field()
    # When the article was downloaded in the DB
    download_date = scrapy.Field()

    # --- Origin of the article ---
    # Root domain from which the article came
    source_domain = scrapy.Field()
    # NOTE(review): presumably the URL the article was crawled from -
    # undocumented in the original, confirm against the code that fills it.
    url = scrapy.Field()
    # Title of the article
    # NOTE(review): presumably the raw HTML page title, as opposed to the
    # extracted `article_title` below - confirm.
    html_title = scrapy.Field()
    # Response object from crawler
    spider_response = scrapy.Field()
    # Title of the article as stored in the RSS feed
    rss_title = scrapy.Field()

    # --- Extracted article content ---
    # Extracted article title
    article_title = scrapy.Field()
    # Extracted article description
    article_description = scrapy.Field()
    # Extracted article text body
    article_text = scrapy.Field()
    # Extracted top image of the article
    article_image = scrapy.Field()
    # Extracted article author
    article_author = scrapy.Field()
    # Extracted publishing date
    article_publish_date = scrapy.Field()
    # Extracted language of the article
    article_language = scrapy.Field()