A crawler.lisp => crawler.lisp +54 -0
@@ 0,0 1,54 @@
+(defvar *wikipedia-api* "https://en.wikipedia.org/w/api.php")
+
+(defun query-wikipedia (input)
+ "Given a string input return the url of the most relevant wikipedia
+ article."
+ (let ((parameters `(("action" . "opensearch")
+ ("format" . "json")
+ ("limit" . "10")
+ ("search" . ,input))))
+ (first
+ (cadddr
+ (cl-json:decode-json-from-string
+ (map 'string #'code-char
+ (drakma:http-request *wikipedia-api* :parameters parameters)))))))
+
+(defun parse-html (url)
+ "Given a wikipedia url parse the html into a PLUMP-DOM element"
+ (lquery:$ (initialize (drakma:http-request url))))
+
+(defun inside-parens (w p)
+ "Given a word and paragraph check if the word is inside
+ parentheses."
+ (let* ((w-point (search w p))
+ (prefix (subseq p 0 w-point)))
+ (not (= (count #\( prefix)
+ (count #\) prefix)))))
+
+(defun validate-link (a p)
+ "Given a link and its paragraph as PLUMP-DOM elements, check if the
+ link is valid, meaning it's not inside parentheses, it's not a link
+ to current page, external or red link."
+ (let ((a-href (aref (lquery:$ a (attr :href)) 0))
+ (a-text (aref (lquery:$ a (text)) 0))
+ (p-text (aref (lquery:$ p (text)) 0)))
+ (and (not (search ":" a-href))
+ (not (search "//" a-href))
+ (not (search "#" a-href))
+ (search "/wiki/" a-href)
+ (not (inside-parens a-text p-text)))))
+
+(defun grab-link (url)
+ "Given a wikipedia url return the first link in the articles
+ content.
+
+ It shall adhere to the following rules:
+ 1. Selects the first non-parenthesized, non-italicized link
+ 2. Ignores external links, links to the current page, or red
+ links (links to non-existent pages)"
+ (let ((content (parse-html url)))
+ (loop named outer for p across (lquery:$ content "#mw-content-text p")
+ do (loop named inner for a across (lquery:$ p "a")
+ do (if (validate-link a p)
+ (return-from outer
+ (aref (lquery:$ a (attr :href)) 0)))))))
A packages.lisp => packages.lisp +2 -0
@@ 0,0 1,2 @@
+(defpackage wikisophy.crawler
+ (:use :cl))
M wikisophy.asd => wikisophy.asd +4 -2
@@ 17,5 17,7 @@
:serial t
:depends-on (:hunchentoot
:drakma
- :cl-json)
- :components ((:file "wikisophy")))
+ :cl-json
+ :lquery)
+ :components ((:file "packages")
+ (:file "crawler")))