@@ 17,9 17,20 @@ except sqlite3.OperationalError:
url_ids = {}
+# Convert coordinates in degree/minute/second form to decimal
+def toDecimalCoordinate(coordinate):
+ coordinate = coordinate.replace('′', '°').replace('″', '°').split('°')
+ decimal = int(coordinate[0]) + \
+ int(coordinate[1]) / 60 + \
+ int(coordinate[2]) / 3600
+ if coordinate[3] in ['W','S']:
+ decimal = -decimal
+ return decimal
+
+
class WikiDataSpider(scrapy.Spider):
name = 'wiki_data'
- allowed_domains = ['wikipedia.org', 'geohack.toolforge.org']
+ allowed_domains = ['wikipedia.org']
start_urls = []
for city in cities:
if city[1]:
@@ 34,35 45,40 @@ class WikiDataSpider(scrapy.Spider):
url_ids[url] = city[0]
start_urls.append(url)
+ def toDecimalCoordinate(coordinate):
+ coordinate = re.split('°′″')
+ decimal = coordinate[0] + \
+ coordinate[1] / 60 + \
+ coordinate[2] / 3600
+ if coordinate[3] in ['W','S']:
+ decimal = -decimal
+ return decimal
+
def parse(self, response):
wiki_img = response.xpath(
'//meta[@property="og:image"]/@content'
).get()
+
+ latitude = toDecimalCoordinate(
+ response.xpath('//span[@class="latitude"]/text()').get()
+ )
- geo_url = 'https:' + response.xpath(
- '//a/@href[contains(., "geohack")]').get()
+ longitude = toDecimalCoordinate(
+ response.xpath('//span[@class="longitude"]/text()').get()
+ )
- if geo_url:
- request = scrapy.Request(url=geo_url, callback=self.parse_geo)
- request.meta['city_id'] = url_ids[response.url]
- request.meta['image_url'] = wiki_img
- yield request
- else:
- yield {
- 'city_id': url_ids[response.url],
- 'image_url': wiki_img,
- 'latitude': '',
- 'longitude': '',
- }
-
- def parse_geo(self, response):
- latitude = response.xpath('//span[@class="latitude p-latitude"]/text()'
- ).get()
- longitude = response.xpath('//span[@class="longitude p-longitude"]/text()'
- ).get()
yield {
- 'city_id': response.meta['city_id'],
- 'image_url': response.meta['image_url'],
+ 'city_id': url_ids[response.url],
+ 'image_url': wiki_img,
'latitude': latitude,
'longitude': longitude,
}
+
+ def toDecimalCoordinate(coordinate):
+ coordinate = re.split('°′″')
+ decimal = coordinate[0] + \
+ coordinate[1] / 60 + \
+ coordinate[2] / 3600
+ if coordinate[3] in ['W','S']:
+ decimal = -decimal
+ return decimal