~akspecs/numbeo-scraping

bf0e1887873924336657d8e8c7a93c46976c1d2d — Rebecca Medrano 7 months ago feaa9a3
spiders: update wiki_data location scraping

 - get coordinates directly from start_urls; this fixes an issue
   where data was getting mixed up when scraping concurrently
1 files changed, 39 insertions(+), 23 deletions(-)

M numbeo/numbeo/spiders/wiki_data.py
M numbeo/numbeo/spiders/wiki_data.py => numbeo/numbeo/spiders/wiki_data.py +39 -23
@@ 17,9 17,20 @@ except sqlite3.OperationalError:
url_ids = {}


# Convert coordinates in degree/minute/second form to decimal
def toDecimalCoordinate(coordinate):
    coordinate = coordinate.replace('′', '°').replace('″', '°').split('°')
    decimal = int(coordinate[0]) + \
              int(coordinate[1]) / 60 + \
              int(coordinate[2]) / 3600
    if coordinate[3] in ['W','S']:
        decimal = -decimal
    return decimal


class WikiDataSpider(scrapy.Spider):
    name = 'wiki_data'
    allowed_domains = ['wikipedia.org', 'geohack.toolforge.org']
    allowed_domains = ['wikipedia.org']
    start_urls = []
    for city in cities:
        if city[1]:


@@ 34,35 45,40 @@ class WikiDataSpider(scrapy.Spider):
        url_ids[url] = city[0]
        start_urls.append(url)

    def toDecimalCoordinate(coordinate):
        """Intended to convert a degree/minute/second coordinate to decimal.

        NOTE(review): this class-level definition shares its name with the
        module-level ``toDecimalCoordinate`` and looks like the pre-change
        version of it -- confirm whether it is meant to be removed.
        """
        # NOTE(review): re.split() receives only a pattern here; the
        # ``coordinate`` argument is never passed to it.
        coordinate = re.split('°′″')
        # Parts are combined as degrees + minutes / 60 + seconds / 3600,
        # with no numeric conversion of the split pieces.
        decimal = coordinate[0] + \
                  coordinate[1] / 60 + \
                  coordinate[2] / 3600
        # A fourth piece of 'W' or 'S' flips the sign of the result.
        if coordinate[3] in ['W','S']:
            decimal = -decimal
        return decimal

    def parse(self, response):
        """Extract the image URL and location data from a response.

        Yields either a follow-up request to the page's geohack link
        (handled by ``parse_geo``, with ``city_id`` and ``image_url`` carried
        in the request meta) or an item dict whose ``latitude`` and
        ``longitude`` are empty strings.

        NOTE(review): this body appears to interleave two versions of the
        method (a geohack follow-up and coordinates read directly from the
        page) -- confirm which statements are meant to remain.
        """
        # Image URL taken from the page's og:image meta tag.
        wiki_img = response.xpath(
            '//meta[@property="og:image"]/@content'
        ).get()
        
        # Latitude text converted through toDecimalCoordinate.
        # NOTE(review): ``latitude`` is not used again in this method.
        latitude = toDecimalCoordinate(
                       response.xpath('//span[@class="latitude"]/text()').get()
                   )

        # First href containing "geohack", prefixed with a scheme.
        # NOTE(review): because 'https:' is concatenated before the test
        # below, any value that reaches ``if geo_url`` is non-empty.
        geo_url = 'https:' + response.xpath(
                '//a/@href[contains(., "geohack")]').get()
        # Longitude text converted through toDecimalCoordinate.
        # NOTE(review): ``longitude`` is not used again in this method.
        longitude = toDecimalCoordinate(
                        response.xpath('//span[@class="longitude"]/text()').get()
                    )

        if geo_url:
            # Pass the city id and image along so parse_geo can emit them.
            request = scrapy.Request(url=geo_url, callback=self.parse_geo)
            request.meta['city_id'] = url_ids[response.url]
            request.meta['image_url'] = wiki_img
            yield request
        else:
            # No geohack link: emit the item with empty coordinates.
            yield {
                'city_id': url_ids[response.url],
                'image_url': wiki_img,
                'latitude': '',
                'longitude': '',
            }

    def parse_geo(self, response):
        """Yield an item with the latitude/longitude text found in a response.

        The coordinates are read from the ``p-latitude`` and ``p-longitude``
        spans and emitted as extracted, without conversion.

        NOTE(review): the yielded dict appears to interleave two versions of
        the item -- confirm which key/value lines are meant to remain.
        """
        latitude = response.xpath('//span[@class="latitude p-latitude"]/text()'
                   ).get()
        longitude = response.xpath('//span[@class="longitude p-longitude"]/text()'
                    ).get()
        yield {
            'city_id': response.meta['city_id'],
            'image_url': response.meta['image_url'],
            # NOTE(review): the next two keys repeat the two above; in a dict
            # literal the later entry wins. ``wiki_img`` is not assigned
            # anywhere in this method.
            'city_id': url_ids[response.url],
            'image_url': wiki_img,
            'latitude': latitude,
            'longitude': longitude,
        }

    def toDecimalCoordinate(coordinate):
        """Intended to convert a degree/minute/second coordinate to decimal.

        NOTE(review): a definition with the same name and body also appears
        earlier in this class, and a module-level ``toDecimalCoordinate``
        exists as well -- confirm whether this copy is meant to be removed.
        """
        # NOTE(review): re.split() receives only a pattern here; the
        # ``coordinate`` argument is never passed to it.
        coordinate = re.split('°′″')
        # Parts are combined as degrees + minutes / 60 + seconds / 3600,
        # with no numeric conversion of the split pieces.
        decimal = coordinate[0] + \
                  coordinate[1] / 60 + \
                  coordinate[2] / 3600
        # A fourth piece of 'W' or 'S' flips the sign of the result.
        if coordinate[3] in ['W','S']:
            decimal = -decimal
        return decimal