
;;; prefix trees are a data structure for efficient storage and
;;; manipulation of all substrings of a given piece of text.

(define (fail-map f val) (if val (f val) val))

;; character indexing

(define *maxchar* 256)
(define *charbase* (char->integer #\newline))
(define (char->index ch)
  (modulo (- (char->integer ch) *charbase*) *maxchar*))
(define (index->char idx) (integer->char (+ *charbase* idx)))

;; basic ptree handlers

(define (make-ptree) (cons 0 '()))
(define (ptree? obj)
  (and (pair? obj) (number? (car obj)) (list? (cdr obj))))
(define ptree-freq car)
(define ptree-set-freq! set-car!)
(define ptree-subtree-slot cdr)
(define ptree-set-subtree-slot! set-cdr!)

(define (ptree-subtree pt ch)
  (let ((s (ptree-subtree-slot pt)))
    (if (vector? s)
      (vector-ref s (char->index ch))
      (fail-map cadr (assv ch s)))))

(define (ptree-subtree-alist pt)
  (let ((s (ptree-subtree-slot pt)))
    (if (not (vector? s)) s
      (ptree-subtree-reduce
	(lambda (acc key val) (cons (list key val) acc))
	'() pt))))

(define (ptree-subtree-reduce f init pt)
  (let ((s (ptree-subtree-slot pt)))
    (if (vector? s)
      (let loop ((index 0) (acc init))
	(if (>= index *maxchar*) acc
	  (let ((elem (vector-ref s index)))
	    (loop (+ index 1)
		  (if (not elem) acc
		    (f acc (index->char index) elem))))))
      (let loop ((rest s) (acc init))
	(if (null? rest) acc
	  (loop (cdr rest) (f acc (caar rest) (cadar rest))))))))

;; ptree mutators

(define *split-point* 20)

(define (ptree-inc-freq! pt)
  (let ((freq (ptree-freq pt)))
    (ptree-set-freq! pt (+ 1 freq))
    ; changing to vectors does not seem to pay off
    ;(if (= freq *split-point*) (ptree-change-to-vector! pt))
    pt))

(define (ptree-change-to-vector! pt)
  (if (vector? (ptree-subtree-slot pt)) #f
    (let ((v (make-vector *maxchar* #f)))
      (map (lambda (knot)
	     (vector-set! v (char->index (car knot)) (cadr knot)))
	   (ptree-subtree-slot pt))
      (ptree-set-subtree-slot! pt v))))

(define (ptree-set-subtree! pt ch val)
  (let ((s (ptree-subtree-slot pt)))
    (if (vector? s)
      (vector-set! s (char->index ch) val)
      (let ((knot (assv ch s)))
	(if knot
	  (set-car! (cdr knot) val)
	  (ptree-set-subtree-slot! pt (cons (list ch val) s)))))
    val))

(define (ptree-inc-into-subtree! pt ch)
  (ptree-inc-freq! (or (ptree-subtree pt ch)
		       (ptree-set-subtree! pt ch (make-ptree)))))

;; ptree information retrieval

(define (ptree-subtree-freq pt ch)
  (let ((subt (ptree-subtree pt ch)))
    (if (not subt) 0 (ptree-freq subt))))

(define (ptree-char-probability pt ch)
  (/ (ptree-subtree-freq pt ch) (exact->inexact (ptree-freq pt))))

(define (ptree-contained-strings pt)
  (define (subtree-contained-strings subt)
    (map (lambda (str) (cons (car subt) str))
	 (ptree-contained-strings (cadr subt))))
  (cons '() (apply append (map subtree-contained-strings
			       (ptree-subtree-alist pt)))))

(define (sq x) (* x x))
(define (ptree-subtree-sq-sum pt)
  (ptree-subtree-reduce
    (lambda (acc ch subt) (+ acc (sq (ptree-freq subt)))) 0 pt))

(define (ptree-probability-distribution pt)
  (/ (ptree-subtree-sq-sum pt)
     (exact->inexact (sq (ptree-freq pt)))))

; this is not actually used
(define (chi-square xs)
  (let ((sum (exact->inexact (apply + xs))))
    (/ (apply + (map sq xs)) (sq sum))))

;; ptree learners

(define (map-take f bound ls)
  (if (or (null? ls) (<= bound 0)) '()
    (cons (f (car ls)) (map-take f (- bound 1) (cdr ls)))))

(define (make-ptree-learner pt) (list (ptree-inc-freq! pt)))
(define (ptree-learner-update pt learner ch maxdepth)
  (cons (ptree-inc-freq! pt)
	(map-take (lambda (ptr) (ptree-inc-into-subtree! ptr ch))
		  maxdepth learner)))

(define (ptree-learn pt clist maxdepth)
  (let loop ((lrn (make-ptree-learner pt)) (rest clist))
    (if (snull? rest) lrn
      (loop (ptree-learner-update pt lrn (scar rest) maxdepth)
	    (scdr rest)))))

;; ptree traversers (actually, learners and traversers are the same -
;; they are just treated with separate functions)

(define (filter-map f ls)
  (if (null? ls) ls
    (let ((val (f (car ls))) (rest (filter-map f (cdr ls))))
      (if val (cons val rest) rest))))

(define (make-ptree-trav pt) (list pt))
(define (ptree-trav-update pt trav ch)
  (cons pt (filter-map (lambda (ptr) (ptree-subtree ptr ch)) trav)))

;; next character prediction

(define (filter-max-scores scoring ls)
  (let loop ((max-score 0) (found '()) (ls ls))
    (if (null? ls) found
      (let* ((elem (car ls))
             (score (scoring elem)))
        (loop (max score max-score)
              (cond ((> score max-score) (list elem))
                    ((= score max-score) (cons elem found))
                    (else found))
              (cdr ls))))))

(define (ptree-predict-next-char pt)
  (map car (filter-max-scores
	     (lambda (subt) (ptree-freq (cadr subt)))
	     (ptree-subtree-alist pt))))

;; morphological analysis

(define (list-average ls)
  (/ (apply + ls) (length ls)))

(define *pct-width* 15)
(define (display-pct pct)
  (let ((nchars (* pct *pct-width*)))
    (do ((n 0 (+ n 1))) ((>= n *pct-width*))
      (display (if (< n nchars) #\# #\.)))))

(define (ptree-trav-probability-distribution trav)
  (list-average (map ptree-probability-distribution trav)))

(define (ptree-trav-char-probability trav ch)
  (list-average (map (lambda (pt) (ptree-char-probability pt ch)) trav)))

(define (ptree-annotate pt stream)
  (let loop ((state (make-ptree-trav pt)) (stream stream))
    (if (snull? stream) #t
      (let ((ch (scar stream)))
	(display-pct (ptree-trav-probability-distribution state))
	(display " ") (display ch) (display " ")
	(display-pct (ptree-trav-char-probability state ch))
	(newline)
	(loop (ptree-trav-update pt state ch) (scdr stream))))))

; slower but works with less training data
(define (ptree-morphemes-combined-measure state ch)
; probability distribution is good for words that have one overly common form;
; character probability is good for words we know only one version of.
  (min (ptree-trav-probability-distribution state)
       (ptree-trav-char-probability state ch)))

(define (ptree-morphemes pt stream . maybe-params)
  (let ((cutoff (if (null? maybe-params) 1.9 (car maybe-params)))
	(prob-measure (if (or (null? maybe-params) (null? (cdr maybe-params)))
			ptree-trav-char-probability
			(cadr maybe-params))))
    (let loop ((prev-prob 0.)
	       (acc '())
	       (state (make-ptree-trav pt))
	       (rest stream))
      (if (snull? rest) (scons (list->string (reverse acc)) snil)
	(let* ((ch (scar rest)) 
	       (new-prob (prob-measure state ch))
	       (new-state (ptree-trav-update pt state ch)))
	  (if (or (= new-prob 0.) (> (/ prev-prob new-prob) cutoff))
	    (scons (list->string (reverse acc))
		   (loop new-prob (list ch) new-state (scdr rest)))
	    (loop new-prob (cons ch acc) new-state (scdr rest))))))))

;; generation

(define (pick ls)
  (list-ref ls (random-integer (length ls))))

(define (ptree-choose-next-char pt)
  (let loop ((subs (ptree-subtree-alist pt))
	     (randval (random-integer (ptree-freq pt))))
    (cond ((null? subs) #\space)
	  ((null? (cdr subs)) (caar subs))
	  (else
	   (let ((freq (ptree-freq (cadar subs))))
	     (if (< randval freq) (caar subs)
	       (loop (cdr subs) (- randval freq))))))))

(define (ptree-trav-choose-next-char trav)
  (ptree-choose-next-char
    (car (filter-max-scores ptree-probability-distribution trav))))

(define (ptree-generate pt nchars)
  (let loop ((result '()) (nchars nchars) (trav (make-ptree-trav pt)))
    (if (= nchars 0) (list->string (reverse result))
      (let ((c (ptree-trav-choose-next-char trav)))
	(loop (cons c result) (- nchars 1) (ptree-trav-update pt trav c))))))

