~akspecs/numbeo-scraping

feaa9a36a3346a33226069fad13475b46b232dae — Rebecca Medrano 7 months ago ba00d52
spiders: scrape location data from wikipedia

 - rename wiki_images.py to wiki_data.py and scrape location
   coordinates

 - update json2sqlite.py to include latitude and longitude columns
   in 'cities' table
3 files changed, 42 insertions(+), 7 deletions(-)
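
For reference, a minimal sketch of how the renamed spider might be run to
regenerate wiki_data.json before json2sqlite.py consumes it. The import path
and feed settings are assumptions based on the project layout shown below,
not part of this commit:

    # a sketch, assuming the standard Scrapy project layout under numbeo/numbeo/
    from scrapy.crawler import CrawlerProcess
    from numbeo.spiders.wiki_data import WikiDataSpider

    process = CrawlerProcess(settings={
        # write scraped items to the JSON feed that json2sqlite.py reads
        'FEEDS': {'wiki_data.json': {'format': 'json', 'overwrite': True}},
    })
    process.crawl(WikiDataSpider)
    process.start()  # blocks until the crawl finishes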

M numbeo/json2sqlite.py
R numbeo/numbeo/spiders/{wiki_images.py => wiki_data.py}
M numbeo/numbeo/spiders/wiki_urls.py
M numbeo/json2sqlite.py => numbeo/json2sqlite.py +14 -1
@@ -43,6 +43,8 @@ cur.execute('''
                 city_name VARCHAR(50) NOT NULL,
                 region VARCHAR(50),
                 country_id INTEGER NOT NULL,
+                latitude DECIMAL(3, 6),
+                longitude DECIMAL(3, 6),
                 city_url VARCHAR(200) NOT NULL UNIQUE,
                 wiki_url VARCHAR(200),
                 FOREIGN KEY (country_id) REFERENCES countries(id),


@@ -82,6 +84,17 @@ with open('wiki_urls.json') as f:
                     WHERE city_id = "{city['city_id']}"
                     '''
         )
+# Add coordinates to cities table from wiki_data.json
+with open('wiki_data.json') as f:
+    cities = json.load(f)
+    for city in cities:
+        cur.execute(f'''
+                    UPDATE cities
+                    SET latitude = {city['latitude']},
+                        longitude = {city['longitude']}
+                    WHERE city_id = "{city['city_id']}"
+                    '''
+        )
 
 
 # Create quality_of_life table


@@ -310,7 +323,7 @@ cur.execute('''
             ''')
 
 # Fill image_urls table from images.json
-with open('wiki_images.json') as f:
+with open('wiki_data.json') as f:
     cities = json.load(f)
     for city in cities:
         cur.execute('''

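A note on the UPDATE added above: it interpolates scraped values directly into
the SQL string with an f-string. SQLite's NUMERIC affinity means the swapped
precision/scale in DECIMAL(3, 6) is harmless, but the interpolation produces a
syntax error when the spider yields empty latitude/longitude strings and is
injection-prone. A parameterized sketch (not the committed code; 'numbeo.db'
is a placeholder path):

    import json
    import sqlite3

    con = sqlite3.connect('numbeo.db')  # placeholder path
    cur = con.cursor()
    with open('wiki_data.json') as f:
        for city in json.load(f):
            cur.execute(
                'UPDATE cities SET latitude = ?, longitude = ? WHERE city_id = ?',
                (city['latitude'], city['longitude'], city['city_id']),
            )
    con.commit()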
R numbeo/numbeo/spiders/wiki_images.py => numbeo/numbeo/spiders/wiki_data.py +28 -5
@@ -17,9 +17,9 @@ except sqlite3.OperationalError:
 url_ids = {}
 
 
-class WikiImagesSpider(scrapy.Spider):
-    name = 'wiki_images'
-    allowed_domains = ['wikipedia.org']
+class WikiDataSpider(scrapy.Spider):
+    name = 'wiki_data'
+    allowed_domains = ['wikipedia.org', 'geohack.toolforge.org']
     start_urls = []
     for city in cities:
         if city[1]:


@@ -39,7 +39,30 @@ class WikiImagesSpider(scrapy.Spider):
             '//meta[@property="og:image"]/@content'
         ).get()
 
+        geo_url = 'https:' + response.xpath(
+                '//a/@href[contains(., "geohack")]').get()
+
+        if geo_url:
+            request = scrapy.Request(url=geo_url, callback=self.parse_geo)
+            request.meta['city_id'] = url_ids[response.url]
+            request.meta['image_url'] = wiki_img
+            yield request
+        else:
+            yield {
+                'city_id': url_ids[response.url],
+                'image_url': wiki_img,
+                'latitude': '',
+                'longitude': '',
+            }
+
+    def parse_geo(self, response):
+        latitude = response.xpath('//span[@class="latitude p-latitude"]/text()'
+                   ).get()
+        longitude = response.xpath('//span[@class="longitude p-longitude"]/text()'
+                    ).get()
         yield {
-            'city_id': url_ids[response.url],
-            'image_url': wiki_img,
+            'city_id': response.meta['city_id'],
+            'image_url': response.meta['image_url'],
+            'latitude': latitude,
+            'longitude': longitude,
         }

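One caveat on parse_geo: depending on the geohack page markup, the
p-latitude/p-longitude spans may hold DMS strings such as "40°25′08″N" rather
than decimal degrees. If so, a conversion helper along these lines
(hypothetical, not in this commit) would be needed before the values fit the
DECIMAL columns added in json2sqlite.py:

    import re

    # hypothetical helper: convert a DMS string like "40°25′08″N"
    # to signed decimal degrees
    def dms_to_decimal(dms):
        m = re.match(r'(\d+)°(?:(\d+)′)?(?:([\d.]+)″)?\s*([NSEW])', dms)
        if m is None:
            return None
        degrees = float(m.group(1))
        minutes = float(m.group(2) or 0)
        seconds = float(m.group(3) or 0)
        value = degrees + minutes / 60 + seconds / 3600
        return -value if m.group(4) in 'SW' else value

For example, dms_to_decimal('40°25′08″N') returns roughly 40.4189.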
M numbeo/numbeo/spiders/wiki_urls.py => numbeo/numbeo/spiders/wiki_urls.py +0 -1
@@ -47,7 +47,6 @@ class wikiUrlSpider(scrapy.Spider):
 
         start_url = response.request.meta.get('redirect_urls')[0]
 
-
         yield {
             'city_id': url_ids[start_url],
             'wiki_url': wiki_url,