# news-please-crawler/newsplease/helper_classes/url_extractor.py
"""
Helper class for url extraction.
"""
import re
import os
# Python 2/3 compatibility: urlparse moved to urllib.parse in Python 3
try:
from urlparse import urlparse
except ImportError:
from urllib.parse import urlparse
# Python 2/3 compatibility: urllib2 became urllib.request in Python 3
try:
import urllib2
except ImportError:
import urllib.request as urllib2
# len(".markdown") = 9
MAX_FILE_EXTENSION_LENGTH = 9
class UrlExtractor(object):
"""
This class contains url related methods.
"""
@staticmethod
def get_allowed_domain(url, allow_subdomains=True):
"""
Determines the url's domain.
:param str url: the url to extract the allowed domain from
        :param bool allow_subdomains: determines whether to include subdomains
:return str: subdomains.domain.topleveldomain or domain.topleveldomain
"""
if allow_subdomains:
            # escape the dot so only a literal "www." prefix is stripped
            return re.sub(r'^(www\.)',
                          '', re.search(r'[^/]+\.[^/]+', url).group(0))
else:
return re.search(r'[^/.]+\.[^/.]+$',
UrlExtractor.get_allowed_domain(url)).group(0)
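
    # Illustrative usage (a sketch; the example URLs are hypothetical):
    # >>> UrlExtractor.get_allowed_domain('http://www.example.com/news/a.html')
    # 'example.com'
    # >>> UrlExtractor.get_allowed_domain('http://blog.example.com/a.html')
    # 'blog.example.com'
    # >>> UrlExtractor.get_allowed_domain('http://blog.example.com/a.html', False)
    # 'example.com'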
@staticmethod
def get_subdomain(url):
"""
Determines the domain's subdomains.
:param str url: the url to extract any subdomains from
:return str: subdomains of url
"""
        allowed_domain = UrlExtractor.get_allowed_domain(url)
        main_domain = UrlExtractor.get_allowed_domain(url, False)
        # everything before the registered domain is the subdomain part
        return allowed_domain[:len(allowed_domain) - len(main_domain)]
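
    # Illustrative usage (a sketch; the example URLs are hypothetical):
    # >>> UrlExtractor.get_subdomain('http://news.example.com/a.html')
    # 'news.'
    # >>> UrlExtractor.get_subdomain('http://example.com/a.html')
    # ''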
@staticmethod
def follow_redirects(url):
"""
        Gets the url's actual address by following redirects
:param str url: the url to work on
:return str: actual address of url
"""
opener = urllib2.build_opener(urllib2.HTTPRedirectHandler)
return opener.open(url).url
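
    # Illustrative usage (a sketch; needs network access, URL is hypothetical):
    # >>> UrlExtractor.follow_redirects('http://example.com/short')
    # 'http://example.com/full/article.html'   # final address after redirects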
@staticmethod
def get_sitemap_url(url, allow_subdomains):
"""
Determines the domain's robot.txt
:param str url: the url to work on
:param bool allow_subdomains: Determines if the robot.txt may be the
subdomain's
:return: the robot.txt's address
:raises Exception: if there's no robot.txt on the site's domain
"""
if allow_subdomains:
redirect = UrlExtractor.follow_redirects(
"http://" + UrlExtractor.get_allowed_domain(url)
)
else:
redirect = UrlExtractor.follow_redirects(
"http://" +
UrlExtractor.get_allowed_domain(url, False)
)
redirect = UrlExtractor.follow_redirects(url)
# Get robots.txt
parsed = urlparse(redirect)
if allow_subdomains:
url_netloc = parsed.netloc
else:
url_netloc = UrlExtractor.get_allowed_domain(
parsed.netloc, False)
robots = '{url.scheme}://{url_netloc}/robots.txt'.format(
url=parsed, url_netloc=url_netloc)
try:
urllib2.urlopen(robots)
return robots
        except Exception:
            # robots.txt not reachable at this netloc: retry on the main
            # domain, then give up
if allow_subdomains:
return UrlExtractor.get_sitemap_url(url, False)
else:
raise Exception('Fatal: no robots.txt found.')
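
    # Illustrative usage (a sketch; needs network access, URL is hypothetical):
    # >>> UrlExtractor.get_sitemap_url('http://www.example.com/a.html', True)
    # 'http://www.example.com/robots.txt'   # falls back to example.com's if absent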
@staticmethod
def sitemap_check(url):
"""
        Sitemap crawlers are supported by every site that has a
        Sitemap set in its robots.txt.
        :param str url: the url to work on
        :return bool: whether a Sitemap is set in the site's robots.txt
"""
response = urllib2.urlopen(UrlExtractor.get_sitemap_url(url, True))
# Check if "Sitemap" is set
return "Sitemap:" in response.read().decode('utf-8')
def get_rss_url(self, response):
"""
Extracts the rss feed's url from the scrapy response.
:param scrapy_response response: the site to extract the rss feed from
:return str: rss feed url
"""
# if this throws an IndexError, then the webpage with the given url
# does not contain a link of type "application/rss+xml"
return response.urljoin(
response.xpath(
'//link[contains(@type, "application/rss+xml")]'
).xpath('@href').extract()[0]
)
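
    # Illustrative usage inside a scrapy spider callback (a sketch; the
    # callback names are hypothetical):
    # def parse(self, response):
    #     rss_url = UrlExtractor().get_rss_url(response)
    #     yield scrapy.Request(rss_url, callback=self.parse_rss)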
@staticmethod
def get_start_url(url):
"""
Determines the start url to start a crawler from
:param str url: the url to extract the start url from
:return str: http://subdomains.domain.topleveldomain/ of url
"""
return "http://" + UrlExtractor.get_allowed_domain(url) + "/"
@staticmethod
def get_url_directory_string(url):
"""
Determines the url's directory string.
:param str url: the url to extract the directory string from
:return str: the directory string on the server
"""
domain = UrlExtractor.get_allowed_domain(url)
splitted_url = url.split('/')
        # the following commented list comprehension could replace
        # the for loop below
        # index = [index for index in range(len(splitted_url))
        #          if re.search(re.escape(domain), splitted_url[index])
        #          is not None][0]
        for index in range(len(splitted_url)):
            # match the domain literally; re.escape keeps the "." from
            # acting as a regex wildcard
            if re.search(re.escape(domain), splitted_url[index]) is not None:
                # drop the file name (and the empty segment left by a
                # trailing slash) so only directory parts remain
                if splitted_url[-1] == "":
                    splitted_url = splitted_url[index + 1:-2]
                else:
                    splitted_url = splitted_url[index + 1:-1]
                break
return '_'.join(splitted_url)
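
    # Illustrative usage (a sketch; the example URL is hypothetical):
    # >>> UrlExtractor.get_url_directory_string(
    # ...     'http://www.example.com/politics/2016/some-article.html')
    # 'politics_2016'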
@staticmethod
def get_url_file_name(url):
"""
Determines the url's file name.
:param str url: the url to extract the file name from
:return str: the filename (without the file extension) on the server
"""
        url_root_ext = os.path.splitext(url)
        # a "suffix" longer than len(".markdown") is most likely not a real
        # file extension, so keep the whole last path segment instead
        if len(url_root_ext[1]) <= MAX_FILE_EXTENSION_LENGTH:
return os.path.split(url_root_ext[0])[1]
else:
return os.path.split(url)[1]
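

# A minimal demo sketch (not part of the original module). The example URL is
# hypothetical, and only the purely string-based helpers are exercised here,
# since the sitemap and RSS helpers need network access:
if __name__ == '__main__':
    demo_url = 'http://www.example.com/politics/2016/some-article.html'
    print(UrlExtractor.get_allowed_domain(demo_url))        # example.com
    print(UrlExtractor.get_start_url(demo_url))             # http://example.com/
    print(UrlExtractor.get_url_directory_string(demo_url))  # politics_2016
    print(UrlExtractor.get_url_file_name(demo_url))         # some-article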