~amirouche/ruse-babeltoy

ref: cfe490a7c8237964d34b16df7316fcc5dd60d61d ruse-babeltoy/babeltoy.scm -rw-r--r-- 12.8 KiB
cfe490a7 — Amirouche wip 1 year, 3 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
#!chezscheme
(import (chezscheme))
(import (babeltoy))
(import (htmlprag))
(import (matchable))
(import (warc))
(import (foundationdb sync))
(import (index))
;;(import (pool))


(define db #f)
(define maximum-number-of-hits #f)


(define (append-map proc objs)
  (apply append (map proc objs)))


(define (random-seed!)
  (random-seed (modulo (time-second (current-time 'monotonic)) (expt 2 32))))

(define (random-uid)
  (let ((uid (make-bytevector 16)))
    (let loop ((index 16))
      (unless (zero? index)
        (bytevector-u8-set! uid (fx- index 1) (random 256))
        (loop (fx- index 1))))
    uid))

(define (call-with-binary-input-port filename proc)
  (define port (open-file-input-port filename))
  (call-with-values (lambda () (proc port))
    (lambda args
      (close-port port)
      (apply values args))))

(define (binary-port->byte-generator port)
  (lambda ()
    (get-u8 port)))

(define (read-query port)
  (define (f port)
    (define char (read-char port))
    (if (eof-object? char)
        (exit 0)
        (if (char=? char #\newline)
            '()
            (cons char (f port)))))

  (define out (list->string (f port)))

  ;; (display "\033[31;1;1mbabeltoy >\033[0m ")(display out)(newline)

  out)

(define (take objs count)
  (let loop ((objs objs)
             (count count)
             (out '()))
    (if (or (fxzero? count) (null? objs))
        (reverse out)
        (loop (cdr objs) (fx- count 1) (cons (car objs) out)))))

;; this is an association between a filepath and a bag-of-words.
(define *index* '())

(define (alist->hashtable alist)
  (define out (make-hashtable equal-hash equal?))
  (let loop ((alist alist))
    (unless (null? alist)
      (hashtable-set! out (caar alist) (cdar alist))
      (loop (cdr alist))))
  out)

(define (directory-list* directory)
  (map (lambda (path) (string-append directory "/" path))
       (directory-list directory)))

(define (sxml->bag sxml)

  (define (string-space? string)
    (let loop ((chars (string->list string)))
      (if (null? chars)
          #t
          (case (car chars)
            ((#\newline #\space) (loop (cdr chars)))
            (else #f)))))

  (define (recurse sxml)
    (cond
     ((or (null? sxml)
          (symbol? sxml)
          (boolean? sxml))
      (cons 0 #f))
     ((string? sxml)
      (if (string-space? sxml)
          (cons 0 #f)
          (cons 1 sxml)))
     (else (case (car sxml)
             (*TOP* (map recurse (cdr sxml)))
             ((*DECL* link meta script style *COMMENT*) (cons 0 #f))
             ;; TODO: microformats / RDF data
             ;; TODO: extract meta keyword graph, twitter data...
             ;; TODO: extract alt and / or title from images and videos
             (else (match sxml
                     (('url string) (cons 20 (sxml->human-readable-text string)))
                     (('title exprs ...) (cons 15 (map sxml->human-readable-text exprs)))
                     (('h1 exprs ...) (cons 10 (map sxml->human-readable-text exprs)))
                     (('h2 exprs ...) (cons 5 (map sxml->human-readable-text exprs)))
                     (('h3 exprs ...) (cons 3 (map sxml->human-readable-text exprs)))
                     (('h4 exprs ...) (cons 3 (map sxml->human-readable-text exprs)))
                     (('h5 exprs ...) (cons 3 (map sxml->human-readable-text exprs)))
                     ((name ('* attrs ...) exprs ...) (map recurse exprs))
                     ((name exprs ...) (map recurse exprs))))))))

  (define (flatten objs)
    (cond
     ((and (pair? objs) (number? (car objs)))
      (if (pair? (cdr objs))
          (list (cons (car objs) (apply string-append (cdr objs))))
          (list objs)))
     ((list? objs)
      (append-map flatten objs))
     ((pair? objs) (list objs))
     (else '())))

  (define scored (filter cdr (flatten (recurse sxml))))

  (define (make-bag-of-words string boost)
    (define (tokenize string)
      (let loop ((chars (string->list (string-downcase string)))
                 (token #f)
                 (out '()))
        (if (null? chars)
            (if token
                (cons (list->string (reverse token)) out)
                out)
            (let ((char (car chars)))
              (case char ;; TODO: replace with SRFI-14's char-set-contains?
                ((#\q #\w #\e #\r #\t #\y #\u #\i #\o #\p #\a #\s #\d #\f
                  #\g #\h #\j #\k #\l #\z #\x #\c #\v #\b #\n #\m
                   #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9 #\0)
                 (if token
                     (loop (cdr chars) (cons char token) out)
                     (loop (cdr chars) (list char) out)))
                (else (if token
                          (loop (cdr chars) #f (cons (list->string (reverse token)) out))
                          (loop (cdr chars) #f out))))))))

    (define (boost! bag)
      (let loop ((keys (vector->list (hashtable-keys bag))))
        (unless (null? keys)
          (let ((score (hashtable-ref bag (car keys) 0)))
            (hashtable-set! bag (car keys) (fx* score boost))
            (loop (cdr keys)))))
      bag)

    (define (list->bag objs)
      (define out (make-hashtable equal-hash equal?))
      (let loop ((objs objs))
        (if (null? objs)
            (boost! out)
            (let* ((obj (car objs))
                   (count (hashtable-ref out obj 0)))
              (hashtable-set! out obj (fx+ count 1))
              (loop (cdr objs))))))

    (list->bag (tokenize string)))


  (define bags (map (lambda (boost+string)
                      (make-bag-of-words (cdr boost+string)
                                         (car boost+string)))
                    scored))

  (define (merge! bags)
    (let ((bag (car bags)))
      (let loop ((bags (cdr bags)))
        (unless (null? bags)
          (vector-for-each (lambda (token+score)
                             (hashtable-set! bag
                                             (car token+score)
                                             (fx+ (hashtable-ref bag (car token+score) 0)
                                                  (cdr token+score))))
                           (hashtable-cells (car bags)))
          (loop (cdr bags))))
      bag))

  (merge! bags))

(define (bag->alist bag)
  (vector->list (hashtable-cells bag)))

(define (ftw proc directory) ;; file tree walk
  ;; If necessary, prefix DIRECTORY with the current directory
  (unless (char=? (string-ref directory 0) #\/)
    (set! directory (string-append (current-directory) "/" directory)))

  (let loopx ((directories (list directory)))
    (unless (null? directories)
      (let loopy ((items (directory-list* (car directories)))
                  (more-directories '()))
        (if (null? items)
            (loopx (append more-directories (cdr directories)))
            (cond
             ((file-regular? (car items))
              (proc (car items))
              (loopy (cdr items) more-directories))
             ((file-directory? (car items))
              (loopy (cdr items) (cons (car items) more-directories)))
             (else (loopy (cdr items) more-directories))))))))

(define (string-suffix-.html? string)
  (and (char=? (string-ref string (fx- (string-length string) 1)) #\l)
       (char=? (string-ref string (fx- (string-length string) 2)) #\m)
       (char=? (string-ref string (fx- (string-length string) 3)) #\t)
       (char=? (string-ref string (fx- (string-length string) 4)) #\h)
       (char=? (string-ref string (fx- (string-length string) 5)) #\.)))

(define (string-suffix-.warc? string)
  (and (char=? (string-ref string (fx- (string-length string) 1)) #\c)
       (char=? (string-ref string (fx- (string-length string) 2)) #\r)
       (char=? (string-ref string (fx- (string-length string) 3)) #\a)
       (char=? (string-ref string (fx- (string-length string) 4)) #\w)
       (char=? (string-ref string (fx- (string-length string) 5)) #\.)))

(define (bag-for-each-unique proc bag)
  (vector-for-each (lambda (key+value)
                     (proc (car key+value)
                           (cdr key+value)))
                   (hashtable-cells bag)))

(define (string->tokens string)
  (define (interesting? token)
    (<= 3 (string-length token) 255))

  (filter interesting?
          (map car
               (string->bag-of-words string))))

(define (index! tx url sxml tokens)
  ;; TODO: Skip if URL contains a query string
  (define uid (random-uid))

  (define ignored (for-each (lambda (token) (index-document-frequency! tx token)) tokens))

  (define (add! token)
    (index-backward-add! tx token uid))

  (index-forward-set! tx uid url sxml)

  (for-each add! tokens)
  (pk 'wip))

(define (string-contains? string char)
  (let loop ((index (string-length string)))
    (if (fxzero? index)
        #f
        (let ((index (fx- index 1)))
          (if (char=? (string-ref string index) char)
              #t
              (loop index))))))

(define (index-prepare-warc! filepath)

  (define (p uri+body)
    (pk 'fuuu (car uri+body))
    (if (string-contains? (car uri+body) #\?)
        #f
        (let* ((sxml (html->sxml (cadr uri+body)))
               (tokens (string->tokens (sxml->human-readable-text sxml))))
          (display ".")(flush-output-port)
          (fdb-in-transaction db
                              (lambda (tx)
                                (index! tx
                                        (car uri+body)
                                        sxml
                                        tokens))))))

  (define (f _)
    (display "o")(flush-output-port))

  ;; (define pool (make-pool 4))

  (pk 'warc-at filepath)
  
  (call-with-binary-input-port filepath
    (lambda (port)
      (define warc (warc-generator (binary-port->byte-generator port)))
      (let loop ((item (warc)))
        (unless (eof-object? item)
          (p item)
          (loop (warc)))))))

(define (index-prepare-on-disk-html! filepath)
  (let ((sxml (call-with-input-file filepath html->sxml)))
    (index! filepath sxml (string->tokens (sxml->human-readable-text sxml)))))

(define (index-prepare!)
  (define (helper! filepath)
    (when (string-suffix-.warc? filepath)
      (index-prepare-warc! filepath)))

  (ftw helper! (cadr (command-line-arguments))))

(define (index-score filepath+bow query)
  (define filepath (car filepath+bow))
  (define bow (cdr filepath+bow))
  (let loop ((query query)
             (score 0))
    (if (null? query)
        (cons filepath score)
        (let* ((token (caar query))
               (score* (hashtable-ref bow token 0)))
          (loop (cdr query) (fx+ score score*))))))

(define (index-search tokens)
  (define document-total (inexact (index-document-frequency-total db)))

  (define (lft tokens frequency token) ;; wanna be least frequent token
    (if (null? tokens)
        token
        (let ((token-frequency (index-document-frequency db (car tokens))))
          (if (fxzero? token-frequency)
              (lft (cdr tokens) frequency token)
              (if (fx>? token-frequency frequency)
                  (lft (cdr tokens) frequency token)
                  (lft (cdr tokens) token-frequency (car tokens)))))))

  (define seed (lft tokens (greatest-fixnum) #f))

  (if (not seed)
      '()
      (let ((candidates (index-backward-ref db seed)))

      (define (index-search-helper url+sxml)
        (define url (car url+sxml))
        (define sxml `(div (url ,url) ,(cdr url+sxml)))
        (define bag (sxml->bag sxml))

        (cons url
              (apply fl+
                     (map (lambda (token)
                            (let ((token-total (inexact (apply fx+ (vector->list (hashtable-values bag))))))
                              (fl*
                               (fl/ (inexact (hashtable-ref bag token 0)) token-total)
                               (fl/ document-total
                                    (fl+ (inexact (index-document-frequency db token)) 1.0)))))
                          tokens))))

      (take (sort (lambda (a b) (> (cdr a) (cdr b)))
                  (map index-search-helper candidates)) maximum-number-of-hits))))

(define (print hit)
  (format #t "~a @ ~a\n" (car hit) (cdr hit)))

(define (read-query-search-print-loop)
  (define prompt (display "\033[31;1;1mbabeltoy >\033[0m "))
  (define tokens (string->tokens (read-query (current-input-port))))
  (if (null? tokens)
      (begin
        (display "1\nmeh...\n")
        (read-query-search-print-loop))
      (begin
        (let ((hits (index-search tokens)))
          (display (length hits))(newline)
          (for-each
           (lambda (a) (format #t "~3f - ~a\n" (cdr a) (car a)))
           hits)
          ;;(for-each print hits)
          (read-query-search-print-loop)))))

(if (and (not (null? (command-line-arguments)))
         (string=? (car (command-line-arguments)) "index"))
    (begin
      (set! db (index-open))
      (index-prepare!)
      (index-close db))
    (begin
      (set! maximum-number-of-hits (string->number (car (command-line-arguments))))
      (set! db (index-open))
      (read-query-search-print-loop)))