Scraping Project Euler Site With Scrapy

I'm trying to scrape projecteuler.net with Python's Scrapy library, just to get some practice with it. I've seen more than one existing implementation of such a scraper online, but the…

Solution 1:

I think I have found a simple yet fitting solution (at least for my purpose), compared with the existing code written to scrape projecteuler:

# -*- coding: utf-8 -*-
import scrapy
from eulerscraper.items import Problem
from scrapy.loader import ItemLoader


class EulerSpider(scrapy.Spider):
    name = 'euler'
    allowed_domains = ['projecteuler.net']
    start_urls = ["https://projecteuler.net/archives"]

    def parse(self, response):
        # Read the highest page number from the pagination links.
        numpag = response.css("div.pagination a[href]::text").extract()
        maxpag = int(numpag[-1])

        # Follow every problem link on the first archive page.
        for href in response.css("table#problems_table a::attr(href)").extract():
            next_page = "https://projecteuler.net/" + href
            yield response.follow(next_page, self.parse_problems)

        # Queue the remaining archive pages.
        for i in range(2, maxpag + 1):
            next_page = "https://projecteuler.net/archives;page=" + str(i)
            yield response.follow(next_page, self.parse_next)

    def parse_next(self, response):
        # Same link-following as on the first page, for every other archive page.
        for href in response.css("table#problems_table a::attr(href)").extract():
            next_page = "https://projecteuler.net/" + href
            yield response.follow(next_page, self.parse_problems)

    def parse_problems(self, response):
        # Populate a Problem item from the single-problem page.
        l = ItemLoader(item=Problem(), response=response)
        l.add_css("title", "h2")
        l.add_css("id", "#problem_info")
        l.add_css("content", ".problem_content")

        yield l.load_item()
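
To sanity-check these selectors before a full crawl, a scrapy shell session can help (just a sketch; the site's markup may have changed, so treat the selectors as assumptions to verify):

# Run `scrapy shell "https://projecteuler.net/archives"` and try the
# same selectors the spider relies on:
response.css("div.pagination a[href]::text").extract()        # archive page numbers
response.css("table#problems_table a::attr(href)").extract()  # links to single problems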

From the start page (the archives) I follow every link to a problem, scraping the data I need with parse_problems. Then I launch the scraper on the other pages of the site, with the same procedure for every list of links. The Item definition, with its pre- and post-processing, is also very clean:

import re

import scrapy
from scrapy.loader.processors import MapCompose, Compose
from w3lib.html import remove_tags


def extract_first_number(text):
    # Pull the first run of digits out of the "Problem N" string.
    i = re.search(r'\d+', text)
    return int(text[i.start():i.end()])


def array_to_value(element):
    # The loader hands over a list; keep only its single value.
    return element[0]


class Problem(scrapy.Item):
    id = scrapy.Field(
        input_processor=MapCompose(remove_tags, extract_first_number),
        output_processor=Compose(array_to_value)
    )
    title = scrapy.Field(input_processor=MapCompose(remove_tags))
    content = scrapy.Field()

I launch this with the command scrapy crawl euler -o euler.json, and it outputs an array of unordered JSON objects, each corresponding to a single problem. This is fine for me, because I'm going to process it with JavaScript, even though I think resolving the ordering problem via Scrapy can be very simple.
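
Even without touching the Scrapy side, a minimal post-processing sketch could sort the feed after the crawl (assuming the default JSON output above; euler_sorted.json is just an example name):

import json

# Load the unordered feed produced by `scrapy crawl euler -o euler.json`,
# sort the problems by their numeric id, and write them back out.
with open('euler.json') as f:
    problems = json.load(f)

problems.sort(key=lambda p: p['id'])

with open('euler_sorted.json', 'w') as f:
    json.dump(problems, f, indent=2)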

EDIT: in fact it is simple, using this pipeline:

import json


class JsonWriterPipeline(object):

    def open_spider(self, spider):
        self.list_items = []
        self.file = open('euler.json', 'w')

    def close_spider(self, spider):
        # Place every item at the index given by its id, then write valid JSON
        # (joining with commas avoids a trailing comma before the closing bracket).
        ordered_list = [None for i in range(len(self.list_items))]

        for i in self.list_items:
            ordered_list[int(i['id']) - 1] = json.dumps(dict(i))

        self.file.write("[\n")
        self.file.write(",\n".join(ordered_list))
        self.file.write("\n]\n")
        self.file.close()

    def process_item(self, item, spider):
        self.list_items.append(item)
        return item
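
For the pipeline to run it must be enabled in settings.py (a minimal sketch, assuming the class lives in eulerscraper/pipelines.py):

# settings.py (the module path is an assumption; adjust it to your project layout)
ITEM_PIPELINES = {
    'eulerscraper.pipelines.JsonWriterPipeline': 300,
}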

though the best solution may be to create a custom exporter:

from scrapy.exporters import JsonItemExporter
from scrapy.utils.python import to_bytes


class OrderedJsonItemExporter(JsonItemExporter):

    def __init__(self, file, **kwargs):
        # To initialize the object we use JsonItemExporter's constructor.
        super().__init__(file, **kwargs)
        self.list_items = []

    def export_item(self, item):
        # Just collect the items; the actual writing happens at the end.
        self.list_items.append(item)

    def finish_exporting(self):
        # Place every item at the index given by its id.
        ordered_list = [None for i in range(len(self.list_items))]

        for i in self.list_items:
            ordered_list[int(i['id']) - 1] = i

        for i in ordered_list:
            if self.first_item:
                self.first_item = False
            else:
                self.file.write(b',')
                self._beautify_newline()
            itemdict = dict(self._get_serialized_fields(i))
            data = self.encoder.encode(itemdict)
            self.file.write(to_bytes(data, self.encoding))

        self._beautify_newline()
        self.file.write(b"]")

and configure it in the settings so that it is used for JSON output:

FEED_EXPORTERS = {
    'json': 'eulerscraper.exporters.OrderedJsonItemExporter',
}
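
With this exporter registered, the same scrapy crawl euler -o euler.json command should produce the ordered JSON directly, without the extra pipeline.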
