
;;; regular expression into fsa compilation

;; Regular expressions

(define (re->fsa re)
  (cond ((eq? '* re) (fsa-any-item))
	((or (symbol? re) (char? re) (number? re)) (fsa-item re))
	((string? re) (fsa-sequence (string->list re)))
	((null? re) (fsa-success))
	((tagged? 'literal re) (fsa-sequence (cdr re)))
	((tagged? 'none-of re) (fsa-complement (re-altseq->fsa (cdr re))))
	((tagged? 'some-of re) (re-altseq->fsa (cdr re)))
	((tagged? 'all-of re) (re-allseq->fsa (cdr re)))
	((tagged? 'seq-of re) (re-seq->fsa (cdr re)))
	((tagged? 'many-times re) (fsa-repeat (re-seq->fsa (cdr re))))
	(else (error "re->fsa: unknown RE" re))))

(define (fold-re-sequence-with fn)
  (define (loop seq)
    (cond ((null? seq) (error "fold-re-sequence: empty sequence"))
	  ((null? (cdr seq)) (re->fsa (car seq)))
	  (else (fn (re->fsa (car seq)) (loop (cdr seq))))))
  loop)

(define re-altseq->fsa (fold-re-sequence-with fsa-union))
(define re-allseq->fsa (fold-re-sequence-with fsa-intersect))
(define re-seq->fsa (fold-re-sequence-with fsa-concat))

;; "human-friendly" regular expression syntax

(define (break-by ident ls)
  (break (lambda (item) (eq? item ident)) ls))

(define (prefixed? prefix ls)
  (or (null? prefix)
      (and (pair? ls)
	   (or (equal? (car prefix) (car ls))
	       (eq? (car prefix) '?))
	   (prefixed? (cdr prefix) (cdr ls)))))

(define (hfre->fsa hfre)
  (call-with-values
    (lambda () (break-by 'except hfre))
    (lambda (start end)
      (if (null? end) (hfre-alts->fsa start)
	(fsa-intersect (hfre-alts->fsa start)
		       (fsa-complement (hfre-alts->fsa (cdr end))))))))

(define (hfre-alts->fsa hfre)
  (call-with-values
    (lambda () (break-by 'or hfre))
    (lambda (start end)
      (if (null? end) (hfre-seq->fsa start)
	(fsa-union (hfre-seq->fsa start) (hfre->fsa (cdr end)))))))

(define (hfre-seq->fsa hfre)
  (call-with-values
    (lambda () (break-by 'then hfre))
    (lambda (start end)
      (if (null? end) (hfre-conds->fsa start)
	(fsa-concat (hfre-conds->fsa start) (hfre-seq->fsa (cdr end)))))))

(define (hfre-conds->fsa hfre)
  (call-with-values
    (lambda () (break-by 'but hfre))
    (lambda (start end)
      (cond ((null? end) (hfre-atom->fsa start))
	    ((prefixed? '(but also) end)
	     (fsa-intersect (hfre-atom->fsa start)
			    (hfre-conds->fsa (cddr end))))
	    (else (error "hfre->fsa: unknown human-friendly RE" hfre))))))

(define (fsa-containing fsa)
  (fsa-concat (fsa-anything) (fsa-concat fsa (fsa-anything))))

(define (hfre-atom->fsa-containing-exactly n hfre)
  (let ((no-match (fsa-complement (fsa-containing (hfre-atom->fsa hfre)))))
    (if (<= n 0) no-match
      (fsa-concat no-match (fsa-concat (hfre-atom->fsa hfre)
				       (hfre-atom->fsa-containing-exactly (- n 1) hfre))))))

(define (hfre-atom->fsa hfre)
  (cond ((null? hfre) (error "hfre->fsa: syntax"))
	((and (pair? (car hfre)) (null? (cdr hfre))) (hfre->fsa (car hfre)))
	((and (string? (car hfre)) (null? (cdr hfre)))
	 (fsa-sequence (string->list (car hfre))))
	((or (prefixed? '(starting with) hfre)
	     (prefixed? '(prefixed by) hfre))
	 (fsa-concat (hfre-atom->fsa (cddr hfre)) (fsa-anything)))
	((or (prefixed? '(ending with) hfre)
	     (prefixed? '(suffixed by) hfre))
	 (fsa-concat (fsa-anything) (hfre-atom->fsa (cddr hfre))))
	((and (or (prefixed? '(with ? of) hfre) (prefixed? '(containing ? of) hfre))
	      (integer? (cadr hfre)))
	 (hfre-atom->fsa (append (cons (cadr hfre) '(of with)) (cdddr hfre))))
	((and (or (prefixed? '(with at least ? of) hfre)
		  (prefixed? '(containing at least ? of) hfre))
	      (integer? (cadddr hfre)))
	 (hfre-atom->fsa (append (cons (cadddr hfre) '(of with))
				 (cdr (cddddr hfre)))))
	((and (or (prefixed? '(with exactly ? of) hfre)
		  (prefixed? '(containing exactly ? of) hfre))
	      (integer? (caddr hfre)))
	 (hfre-atom->fsa-containing-exactly (caddr hfre) (cddddr hfre)))
	((or (prefixed? '(containing) hfre) (prefixed? '(with) hfre))
	 (fsa-containing (hfre-atom->fsa (cdr hfre))))
	((prefixed? '(not) hfre)
	 (fsa-complement (hfre-atom->fsa (cdr hfre))))
	((or (prefixed? '(maybe) hfre) (prefixed? '(usually) hfre))
	 (fsa-union (fsa-success) (hfre-atom->fsa (cdr hfre))))
	((prefixed? '(any number of) hfre)
	 (fsa-repeat (hfre-atom->fsa (cdddr hfre))))
	((prefixed? '(many of) hfre)
	 (fsa-repeat-nonempty (hfre-atom->fsa (cddr hfre))))
	((and (prefixed? '(at least ? of) hfre) (integer? (caddr hfre)))
	 (fsa-concat (hfre-atom->fsa (cddr hfre))
		     (fsa-repeat (hfre-atom->fsa (cddddr hfre)))))
	((prefixed? '(1 of) hfre) (hfre-atom->fsa (cddr hfre)))
	((and (prefixed? '(? of) hfre) (integer? (car hfre)))
	 (fsa-concat (hfre-atom->fsa (cddr hfre))
		     (hfre-atom->fsa (cons (- (car hfre) 1) (cdr hfre)))))
	((equal? '(anything) hfre) (fsa-anything))
	((equal? '(nothing) hfre) (fsa-success))
	((equal? '(any letter) hfre) (fsa-any-item))
	((equal? '(something) hfre) (fsa-repeat-nonempty (fsa-any-item)))
	(else (error "hfre->fsa: unknown human-friendly RE" hfre))))

