Scraping Project Euler Site With Scrapy
Solution 1:
I think I have found the simplest solution that still fits my purpose, compared with the existing code written to scrape Project Euler:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.loader import ItemLoader

from eulerscraper.items import Problem
class EulerSpider(scrapy.Spider):
    """Spider that walks the Project Euler archive and yields one ``Problem`` per problem page."""

    name = 'euler'
    allowed_domains = ['projecteuler.net']
    start_urls = ["https://projecteuler.net/archives"]

    def parse(self, response):
        """Handle the first archive page.

        Yields a request for every problem linked from this page, followed by
        one request per remaining archive page (2 through the last page), each
        handled by ``parse_next``.
        """
        page_labels = response.css("div.pagination a[href]::text").extract()
        # The text of the last pagination link is taken as the highest page
        # number. Without any pagination links only this page is crawled
        # instead of failing with an IndexError.
        maxpag = int(page_labels[-1]) if page_labels else 1

        yield from self._follow_problem_links(response)

        for page in range(2, maxpag + 1):
            next_page = "https://projecteuler.net/archives;page=" + str(page)
            yield response.follow(next_page, self.parse_next)
        # No trailing ``return [Request(...)]``: a value returned from a
        # generator callback is never iterated, so that request was dead code.

    def parse_next(self, response):
        """Handle an archive page after the first: follow its problem links."""
        yield from self._follow_problem_links(response)

    def _follow_problem_links(self, response):
        """Yield a ``parse_problems`` request for each link in the problems table."""
        for href in response.css("table#problems_table a::attr(href)").extract():
            next_page = "https://projecteuler.net/" + href
            yield response.follow(next_page, self.parse_problems)

    def parse_problems(self, response):
        """Load one problem page into a ``Problem`` item and yield it."""
        loader = ItemLoader(item=Problem(), response=response)
        loader.add_css("title", "h2")
        loader.add_css("id", "#problem_info")
        loader.add_css("content", ".problem_content")
        yield loader.load_item()
From the start page (archives) I follow every single link to a problem, scraping the data that I need with parse_problems. Then I launch the scraper for the other pages of the site, applying the same procedure to every list of links.
The Item definition, with its input and output processors, is also very clean:
import re
import scrapy
from scrapy.loader.processors import MapCompose, Compose
from w3lib.html import remove_tags
def extract_first_number(text):
    """Return the first run of decimal digits in *text* as an ``int``.

    Raises:
        ValueError: if *text* contains no digit.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'\d+', text)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("no number found in %r" % (text,))
    return int(match.group())
def array_to_value(element):
    """Return the first entry of the sequence *element*.

    Raises:
        IndexError: if *element* is empty.
    """
    return element[0]
class Problem(scrapy.Item):
    """Item holding one scraped Project Euler problem."""

    # Tags are stripped and the first number in the text is parsed as an int;
    # the output processor then keeps only the first collected value.
    id = scrapy.Field(
        input_processor=MapCompose(remove_tags, extract_first_number),
        output_processor=Compose(array_to_value),
    )
    # Title text with markup removed.
    title = scrapy.Field(input_processor=MapCompose(remove_tags))
    # Stored as collected; no processors are declared for this field.
    content = scrapy.Field()
I launch this with the command scrapy crawl euler -o euler.json
and it outputs an unordered array of JSON objects, each one corresponding to a single problem. This is fine for me because I'm going to process it with JavaScript, even though I think solving the ordering problem via Scrapy can be very simple.
EDIT: in fact it is simple, using this pipeline:
import json
class JsonWriterPipeline(object):
    """Item pipeline that buffers every item and writes them to ``euler.json`` sorted by ``id``."""

    def open_spider(self, spider):
        """Start with an empty buffer and open the output file for writing."""
        self.list_items = []
        self.file = open('euler.json', 'w', encoding='utf-8')

    def close_spider(self, spider):
        """Write the buffered items as one JSON array ordered by ``id``, then close the file."""
        try:
            # Sorting (rather than slotting items into a pre-sized list by
            # ``id - 1``) tolerates gaps in the ids and never emits a
            # placeholder for a missing one.
            ordered = sorted(self.list_items, key=lambda item: item['id'])
            # Joining puts separators only *between* elements; a comma after
            # the last element would make the file invalid JSON.
            body = ",\n".join(json.dumps(dict(item)) for item in ordered)
            self.file.write("[\n" + body + "\n]\n")
        finally:
            self.file.close()

    def process_item(self, item, spider):
        """Buffer *item* for ordered output and pass it on unchanged."""
        self.list_items.append(item)
        return item
though the best solution may be to create a custom exporter:
from scrapy.exporters import JsonItemExporter
from scrapy.utils.python import to_bytes
class OrderedJsonItemExporter(JsonItemExporter):
    """JSON exporter that buffers all items and writes them sorted by ``id`` when exporting finishes."""

    def __init__(self, file, **kwargs):
        """Initialise the base exporter and an empty item buffer.

        Keyword arguments are forwarded to ``JsonItemExporter`` so exporter
        options passed by the caller are not silently dropped.
        """
        super().__init__(file, **kwargs)
        self.list_items = []

    def export_item(self, item):
        """Buffer *item*; nothing is written until ``finish_exporting``."""
        self.list_items.append(item)

    def finish_exporting(self):
        """Write the buffered items in ascending ``id`` order and close the JSON array."""
        # Sorting tolerates gaps in the ids, unlike slotting items into a
        # pre-sized list by ``id - 1``.
        for item in sorted(self.list_items, key=lambda entry: entry['id']):
            # Delegate serialisation and separator handling to the base class
            # instead of duplicating its private internals here.
            super().export_item(item)
        super().finish_exporting()
and configure it in settings to call it for json:
# Use the ordered exporter whenever the "json" feed format is requested.
FEED_EXPORTERS = {'json': 'eulerscraper.exporters.OrderedJsonItemExporter'}
Post a Comment for "Scraping Project Euler Site With Scrapy"