mirror of
https://github.com/fhamborg/news-please.git
synced 2021-09-27 00:44:24 +03:00
43 lines
1.4 KiB
Python
43 lines
1.4 KiB
Python
# Define here the models for your scraped items
|
|
#
|
|
# See documentation in:
|
|
# https://docs.scrapy.org/en/latest/topics/items.html
|
|
|
|
import scrapy
|
|
|
|
|
|
class NewscrawlerItem(scrapy.Item):
    """Scrapy item describing a single crawled news article.

    Every attribute is a plain ``scrapy.Field()`` with no metadata; the
    comments below state what each field is meant to carry.
    """

    # --- Storage bookkeeping -------------------------------------------
    # Database ID of the article.
    db_id = scrapy.Field()
    # Location of the article file on the local filesystem.
    local_path = scrapy.Field()
    # Same file location, expressed as an absolute path.
    abs_local_path = scrapy.Field()
    # Last-modified timestamp of the article as recorded in the DB.
    modified_date = scrapy.Field()
    # Download timestamp of the article as recorded in the DB.
    download_date = scrapy.Field()

    # --- Origin of the article -----------------------------------------
    # Root domain the article originates from.
    source_domain = scrapy.Field()
    # URL of the article (presumably the address it was fetched from;
    # confirm against the code that populates this item).
    url = scrapy.Field()
    # Article title (going by the name, likely the title from the HTML
    # page; confirm against the code that populates this item).
    html_title = scrapy.Field()
    # Response object handed over by the crawler.
    spider_response = scrapy.Field()
    # Article title as stored in the RSS feed.
    rss_title = scrapy.Field()

    # --- Extracted article content -------------------------------------
    # Title of the article.
    article_title = scrapy.Field()
    # Description of the article.
    article_description = scrapy.Field()
    # Text body of the article.
    article_text = scrapy.Field()
    # Top image of the article.
    article_image = scrapy.Field()
    # Author of the article.
    article_author = scrapy.Field()
    # Publishing date of the article.
    article_publish_date = scrapy.Field()
    # Language of the article.
    article_language = scrapy.Field()
|