fasta.mlish (6819B)
1 #lang s-exp "../../mlish.rkt" 2 (require "../rackunit-typechecking.rkt") 3 4 (define +alu+ 5 (string-append "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG" 6 "GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA" 7 "CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT" 8 "ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA" 9 "GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG" 10 "AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC" 11 "AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA")) 12 13 (check-type +alu+ : String) 14 15 (define IUB 16 (hash #\a 0.27 #\c 0.12 #\g 0.12 #\t 0.27 #\B 0.02 17 #\D 0.02 #\H 0.02 #\K 0.02 #\M 0.02 #\N 0.02 18 #\R 0.02 #\S 0.02 #\V 0.02 #\W 0.02 #\Y 0.02)) 19 20 (check-type IUB : (Hash Char Float)) 21 22 (define HOMOSAPIEN 23 (hash #\a 0.3029549426680 #\c 0.1979883004921 24 #\g 0.1975473066391 #\t 0.3015094502008)) 25 26 (check-type HOMOSAPIEN : (Hash Char Float)) 27 28 (define line-length 60) 29 30 (check-type line-length : Int) 31 32 (define (repeat-fasta [header : String] [N : Int] [sequence : String] -> String) 33 (let* ([out (open-output-string)] 34 [len (string-length sequence)] 35 [buf (make-string (+ len line-length))]) 36 (string-copy! buf 0 sequence) 37 (string-copy! buf len sequence 0 line-length) 38 (write-string header out) 39 (let loop : String ([n N] [start 0]) 40 (if (> n 0) 41 (let ([end (+ start (min n line-length))]) 42 (write-string buf out start end) 43 (write-string "\n" out) 44 (loop (- n line-length) (if (> end len) (- end len) end))) 45 (get-output-string out))))) 46 47 (define IA 3877) 48 (define IC 29573) 49 (define IM 139968) 50 (define IM.0 (fx->fl IM)) 51 52 (define V 53 (for/vector ([id (in-range IM)]) 54 (modulo (+ IC (* id IA)) IM))) 55 56 (check-type V : (Vector Int)) 57 58 (define (random-next [cur : Int] -> Int) (vector-ref V cur)) 59 60 (check-type (tup 0 0.0) : (× Int Float)) 61 62 (check-type (in-hash IUB) : (Sequence (× Char Float))) 63 64 (define (make-lookup-table [frequency-table : (Hash Char Float)] -> String) 65 (let ([v (make-string IM)]) 66 (for/fold ([cs (tup 0 0.0)]) 67 ([k+v (in-hash frequency-table)]) 68 (match cs with 69 [c c. -> 70 (match k+v with 71 [key val -> 72 (let* ([c1. (fl+ c. (fl* IM.0 val))] 73 [c1 (inexact->exact (flceiling c1.))] 74 #;[b (char->integer key)]) 75 (for ([i (in-range c c1)]) (string-set! v i key)) 76 (tup c1 c1.))])])) 77 v)) 78 79 (define (n-randoms [buf : String][out : String-Port][lookup : String] 80 [to : Int][R : Int] -> Int) 81 (let loop : Int ([n 0] [R R]) 82 (if (< n to) 83 (let ([R (random-next R)]) 84 (string-set! buf n (string-ref lookup R)) 85 (loop (add1 n) R)) 86 (begin (write-string buf out 0 (add1 to)) R)))) 87 88 (define LF #\newline) 89 90 (define (make-line! [buf : String] [lookup : String] 91 [start : Int] [R : Int] -> Int) 92 (let ([end (+ start line-length)]) 93 (string-set! buf end LF) 94 (let loop : Int ([n start] [R R]) 95 (if (< n end) 96 (let ([R (random-next R)]) 97 (string-set! buf n (string-ref lookup R)) 98 (loop (add1 n) R)) 99 R)))) 100 101 (define (random-fasta [header : String] [N : Int] 102 [table : (Hash Char Float)] [R : Int] 103 -> (× Int String)) 104 (let* ([buf (make-string (add1 line-length))] 105 [out (open-output-string)] 106 [lookup-str (make-lookup-table table)] 107 [full-lines+last (quotient+remainder N line-length)] 108 [C 109 (let* ([len+1 (add1 line-length)] 110 [buflen (* len+1 IM)] 111 [buf2 (make-string buflen)]) 112 (let loop : String ([R R] [i 0]) 113 (if (< i buflen) 114 (loop (make-line! buf2 lookup-str i R) (+ i len+1)) 115 buf2)))]) 116 (string-set! buf line-length LF) 117 (write-string header out) 118 (tup 119 (match full-lines+last with 120 [full-lines last -> 121 (let loop : Int ([i full-lines] [R R]) 122 (if (> i IM) 123 (begin (write-string C out) (loop (- i IM) R)) 124 (let loop : Int ([i i] [R R]) 125 (cond 126 [(> i 0) 127 (loop 128 (sub1 i) 129 (n-randoms buf out lookup-str line-length R))] 130 [(> last 0) 131 (string-set! buf last LF) 132 (n-randoms buf out lookup-str last R)] 133 [else R]))))]) 134 (get-output-string out)))) 135 136 (define n 10) 137 138 (check-type (repeat-fasta ">ONE Homo sapiens alu\n" (* n 2) +alu+) : String 139 -> ">ONE Homo sapiens alu\nGGCCGGGCGCGGTGGCTCAC\n") 140 141 (define res1 142 (random-fasta ">TWO IUB ambiguity codes\n" (* n 3) IUB 42)) 143 144 (define res2 145 (match res1 with 146 [R str -> 147 (random-fasta ">THREE Homo sapiens frequency\n" (* n 5) HOMOSAPIEN R)])) 148 149 (check-type (proj res1 1) : String 150 -> ">TWO IUB ambiguity codes\nattRtBtaDtatVataKatgaatcccgDtY\n") 151 152 (check-type (proj res2 1) : String 153 -> (string-append ">THREE Homo sapiens frequency\n" 154 "atttgcggaaacgacaaatattaacacatcatcagagtaccataaaggga\n")) 155 156 (define (mk-fasta [n : Int] -> String) 157 (let 158 ([res1 (repeat-fasta ">ONE Homo sapiens alu\n" (* n 2) +alu+)] 159 [res2 (random-fasta ">TWO IUB ambiguity codes\n" (* n 3) IUB 42)] 160 [res3 161 (match res2 with 162 [R str -> 163 (random-fasta ">THREE Homo sapiens frequency\n" (* n 5) HOMOSAPIEN R)])]) 164 (string-append res1 (proj res2 1) (proj res3 1)))) 165 166 (provide mk-fasta) 167 168 (check-type (mk-fasta 100) 169 : String 170 -> (string-append 171 ">ONE Homo sapiens alu\n" 172 "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGA\n" 173 "TCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACT\n" 174 "AAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAG\n" 175 "GCTGAGGCAGGAGAATCGCT\n" 176 ">TWO IUB ambiguity codes\n" 177 "attRtBtaDtatVataKatgaatcccgDtYtcccNaNgtRttNtatttatcctSaRataW\n" 178 "taatNtNctaatctttggMtMttKtYtMHagBttagcMKMttttcWaactgcSttgaaac\n" 179 "gtcatHagHtgtaHVgtcattatgRcaVcaatctcWgaNtttVaaYcaaaataYtgWgtt\n" 180 "acttMgtHHgagtattaaaKSgtBgacaaggSaaRttVaVDHttRgctagtaaacgaaac\n" 181 "ttcRNtgcatttSagBtHttNRaatgtctattcaSaRYcgtatSattttttttgaBgagD\n" 182 ">THREE Homo sapiens frequency\n" 183 "gaagacaggtgtaacgtgggaaaatctctagtaaagctttgatcagcggagacgcgatca\n" 184 "acagatcctttatatcgcgaaacttctctctatcagcgaactaaggagggcgacaatccg\n" 185 "agctgttccggaccaaaccctgaaagtacgactctgctctaataaagtcaaaacgtagaa\n" 186 "gactagatacaattatactgacaacaaaaaaaagttgcgtgcacaagagtacgatgtttg\n" 187 "accgccagttattatgacgagggtgagaacaagtcaggctaaagtagaagagcaccatag\n" 188 "gtatcagtttaactgagtaaatgcgaatgcgtgactttaaataagcctgcgtgtgtcaaa\n" 189 "actctacaatatctttgttatattattgaatcattctggatttgaggcagtggagcatac\n" 190 "tgtataaaataatttttcggtgggtcaaaaataaatttcaattaagacgttaaggataat\n" 191 "gaaatgactcaatctaaggt\n"))