(require (planet "sxml.ss" ("lizorkin" "sxml.plt" 1 4)))

;; lazy-doc-port: input-port
;; The port the lazy document is read from.  It is opened explicitly (rather
;; than with call-with-input-file) because call-with-input-file closes the
;; port as soon as its procedure returns, whereas the lazy parser keeps
;; reading from the port whenever a promise inside lazy-doc is forced later.
;; Call (close-input-port lazy-doc-port) once lazy-doc is fully consumed.
(define lazy-doc-port (open-input-file "some-big-file.xml"))

;; lazy-doc: lazy sxml document
;; The document is parsed on demand as it is traversed.
(define lazy-doc (lazy:xml->sxml lazy-doc-port '()))
(require (planet "sxml.ss" ("lizorkin" "sxml.plt" 1 4)))

;; Small demonstration: lazily parse an in-memory document, select every
;; <p> under <html> with a lazy sxpath query, and display the raw query
;; result followed by a newline.
(let* ([source-port (open-input-string "<html><p>hello</p><p>world</p></html>")]
       [lazy-doc (lazy:xml->sxml source-port '())]
       [select-paragraphs (lazy:sxpath (list "html" "p"))]
       [paragraphs (select-paragraphs lazy-doc)])
  (display paragraphs)
  (newline))
((p hello) #<struct:promise>)
(module gene-ontology mzscheme
  ;; This code shows how one might parse a large xml file progressively
  ;; by taking advantage of the lazy parsing in the sxml module.
  ;;
  ;; Input: an RDF file from the Gene Ontology (http://geneontology.org) that
  ;; conforms to the DTD found at: http://www.geneontology.org/dtd/go.dtd.
  ;;
  ;; Output: a sample parsing of all the terms in the RDF that
  ;; shows accession, name, and definition.
  ;;
  ;; I believe one can always download the latest copy of the Gene Ontology
  ;; by grabbing:
  ;;
  ;; http://archive.godatabase.org/latest-termdb/go_daily-termdb.rdf-xml.gz

  (require (planet "sxml.ss" ("lizorkin" "sxml.plt" 1 4))
           (lib "list.ss")
           (lib "file.ss")
           (lib "pretty.ss"))

  (provide (all-defined))

  ;; test: -> void
  ;; Our test code will just parse each term and print it out.  The path of
  ;; the RDF file is taken from the first command-line argument.  We'll call
  ;; this at the end of our module.
  (define (test)
    (call-with-input-file* (vector-ref (current-command-line-arguments) 0)
      (lambda (ip)
        ;; The fold runs inside the callback so that the port is still open
        ;; while the lazy document is being forced.
        (fold-term-elts (lambda (term-elt acc)
                          (pretty-print (term-elt->Term term-elt))
                          (newline))
                        (void)
                        ip))))

  ;; We'll say that a Term is a
  ;;
  ;;     (make-Term i n d)
  ;;
  ;; where i, n are strings.  d is either a string or void.  This is a
  ;; simplification, since the real Gene Ontology provides a lot of
  ;; other interesting attributes (including hierarchical data).
  ;;
  ;; The #f inspector makes the structure transparent, so pretty-print
  ;; shows its fields.
  (define-struct Term (id name definition) #f)

  ;; fold-lazy-sxpath-list: (sxml Y -> Y) Y sxpath-sxml-stream -> Y
  ;; Given the particular kind of lazy-list given by sxml's lazy:sxpath's query,
  ;; does a fold across its structure.
  ;;
  ;; The lazy list is walked one member at a time.  A member that is a
  ;; promise is forced and its result (itself a lazy list) is spliced in
  ;; front of whatever members remain; any other member is a node handed to
  ;; f.  This handles the common (node promise) shape, and also chunks with
  ;; several nodes before the promise and a final chunk with no trailing
  ;; promise.
  ;; NOTE(review): assumes the promises in lazy:sxpath results satisfy
  ;; mzscheme's promise? -- confirm against the sxml.plt version in use.
  (define (fold-lazy-sxpath-list f acc lazy-list)
    (cond
      [(empty? lazy-list) acc]
      [(promise? (first lazy-list))
       (let ([forced (force (first lazy-list))]
             [remaining (rest lazy-list)])
         (fold-lazy-sxpath-list f
                                acc
                                ;; Usually the promise is the last member, in
                                ;; which case no copying is needed.
                                (if (empty? remaining)
                                    forced
                                    (append forced remaining))))]
      [else
       (fold-lazy-sxpath-list f
                              (f (first lazy-list) acc)
                              (rest lazy-list))]))

  ;; make-namespaced-tag: string string -> symbol
  ;; Builds up a namespaced tag from the namespace ns and the suffix.
  (define (make-namespaced-tag ns suffix)
    (string->symbol (string-append ns ":" suffix)))

  ;; go-tag: string -> symbol
  ;; Builds a tag in the Gene Ontology namespace.
  (define (go-tag suffix)
    (make-namespaced-tag "http://www.geneontology.org/dtds/go.dtd#" suffix))

  ;; rdf-tag: string -> symbol
  ;; Builds a tag in the RDF syntax namespace.
  (define (rdf-tag suffix)
    (make-namespaced-tag "http://www.w3.org/1999/02/22-rdf-syntax-ns#" suffix))

  ;; term-elt->Term: sxml-fragment -> Term
  ;; Given an sxml fragment element elt, extracts a Term.
  ;; The accession and name are required (an error is raised by first if
  ;; either is missing); the definition is optional and becomes void when
  ;; absent.
  (define (term-elt->Term elt)
    (define name-query (sxpath (list (go-tag "name") "text()")))
    (define defn-query (sxpath (list (go-tag "definition") "text()")))
    (define id-query (sxpath (list (go-tag "accession") "text()")))
    (define (first-or-void a-list)
      (cond
        [(empty? a-list) (void)]
        [else (first a-list)]))
    (make-Term (first (id-query elt))
               (first (name-query elt))
               (first-or-void (defn-query elt))))

  ;; fold-term-elts: (elt Y -> Y) Y input-port -> Y
  ;; Given an input port ip whose contents conform to the gene ontology
  ;; RDF, folds f across every term element we can find in ip, using acc
  ;; as the initial accumulator.  ip must stay open until the fold returns,
  ;; because the document is parsed lazily while it is traversed.
  (define (fold-term-elts f acc ip)
    (define doc (lazy:xml->sxml ip '()))
    (define query
      (lazy:sxpath (list (go-tag "go") (rdf-tag "RDF") (go-tag "term"))))
    (fold-lazy-sxpath-list f acc (query doc)))

  ;; Finally, fire this test code up:
  (test))
| CookbookForm | |
|---|---|
| TopicType: | Recipe |
| ParentTopic: | XmlChapter |
| TopicOrder: | 999 |