~bendersteed/wikisophy

0541a4d955b45d35b2778696adfd555d752e463d — Dimakakos Dimos 4 years ago 0cb12d0
Add: crawler that finds the valid first link of any article
3 files changed, 60 insertions(+), 2 deletions(-)

A crawler.lisp
A packages.lisp
M wikisophy.asd
A crawler.lisp => crawler.lisp +54 -0
@@ 0,0 1,54 @@
(defvar *wikipedia-api* "https://en.wikipedia.org/w/api.php")

(defun query-wikipedia (input)
  "Given a string input return the url of the most relevant wikipedia
  article."
  (let ((parameters `(("action" . "opensearch")
		      ("format" . "json")
		      ("limit" . "10")
		      ("search" . ,input))))
    (first
     (cadddr
      (cl-json:decode-json-from-string
       (map 'string #'code-char
	    (drakma:http-request *wikipedia-api* :parameters parameters)))))))

(defun parse-html (url)
  "Given a wikipedia url parse the html into a PLUMP-DOM element"
  (lquery:$ (initialize (drakma:http-request url))))

(defun inside-parens (w p)
  "Given a word and paragraph check if the word is inside
  parentheses."
  (let* ((w-point (search w p))
	 (prefix (subseq p 0 w-point)))
    (not (= (count #\( prefix)
	    (count #\) prefix)))))

(defun validate-link (a p)
  "Given a link and its paragraph as PLUMP-DOM elements, check if the
  link is valid, meaning it's not inside parentheses, it's not a link
  to current page, external or red link."
  (let ((a-href (aref (lquery:$ a (attr :href)) 0))
	(a-text (aref (lquery:$ a (text)) 0))
	(p-text (aref (lquery:$ p (text)) 0)))
    (and (not (search ":" a-href))
	 (not (search "//" a-href))
	 (not (search "#" a-href))
	 (search "/wiki/" a-href)
	 (not (inside-parens a-text p-text)))))

(defun grab-link (url)
  "Given a wikipedia url return the first link in the articles
  content.

  It shall adhere to the following rules:
    1. Selects the first non-parenthesized, non-italicized link
    2. Ignores external links, links to the current page, or red 
       links (links to non-existent pages)"
  (let ((content (parse-html url)))
    (loop named outer for p across (lquery:$ content "#mw-content-text p")
	  do (loop named inner for a across (lquery:$ p "a")
		   do (if (validate-link a p)
			  (return-from outer
			    (aref (lquery:$ a (attr :href)) 0)))))))

A packages.lisp => packages.lisp +2 -0
@@ 0,0 1,2 @@
(defpackage wikisophy.crawler
  (:use :cl))

M wikisophy.asd => wikisophy.asd +4 -2
@@ 17,5 17,7 @@
  :serial t
  :depends-on (:hunchentoot
	       :drakma
	       :cl-json)
  :components ((:file "wikisophy")))
	       :cl-json
	       :lquery)
  :components ((:file "packages")
	       (:file "crawler")))