www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README

fasta.mlish (6819B)


      1 #lang s-exp "../../mlish.rkt"
      2 (require "../rackunit-typechecking.rkt")
      3 
      4 (define +alu+
      5   (string-append "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG"
      6                  "GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA"
      7                  "CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT"
      8                  "ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA"
      9                  "GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG"
     10                  "AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC"
     11                  "AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"))
     12 
     13 (check-type +alu+ : String)
     14 
     15 (define IUB
     16   (hash #\a 0.27 #\c 0.12 #\g 0.12 #\t 0.27 #\B 0.02
     17         #\D 0.02 #\H 0.02 #\K 0.02 #\M 0.02 #\N 0.02
     18         #\R 0.02 #\S 0.02 #\V 0.02 #\W 0.02 #\Y 0.02))
     19 
     20 (check-type IUB : (Hash Char Float))
     21 
     22 (define HOMOSAPIEN
     23   (hash #\a 0.3029549426680 #\c 0.1979883004921
     24         #\g 0.1975473066391 #\t 0.3015094502008))
     25 
     26 (check-type HOMOSAPIEN : (Hash Char Float))
     27 
     28 (define line-length 60)
     29 
     30 (check-type line-length : Int)
     31 
     32 (define (repeat-fasta [header : String] [N : Int] [sequence : String] -> String)
     33   (let* ([out (open-output-string)]
     34          [len (string-length sequence)]
     35          [buf (make-string (+ len line-length))])
     36   (string-copy! buf 0 sequence)
     37   (string-copy! buf len sequence 0 line-length)
     38   (write-string header out)
     39   (let loop : String ([n N] [start 0])
     40     (if (> n 0)
     41       (let ([end (+ start (min n line-length))])
     42         (write-string buf out start end)
     43         (write-string "\n" out)
     44         (loop (- n line-length) (if (> end len) (- end len) end)))
     45       (get-output-string out)))))
     46 
     47 (define IA 3877)
     48 (define IC 29573)
     49 (define IM 139968)
     50 (define IM.0 (fx->fl IM))
     51 
     52 (define V 
     53   (for/vector ([id (in-range IM)])
     54     (modulo (+ IC (* id IA)) IM)))
     55 
     56 (check-type V : (Vector Int))
     57 
     58 (define (random-next [cur : Int] -> Int) (vector-ref V cur))
     59 
     60 (check-type (tup 0 0.0) : (× Int Float))
     61 
     62 (check-type (in-hash IUB) : (Sequence (× Char Float)))
     63 
     64 (define (make-lookup-table [frequency-table : (Hash Char Float)] -> String)
     65   (let ([v (make-string IM)])
     66     (for/fold ([cs (tup 0 0.0)])
     67               ([k+v (in-hash frequency-table)])
     68       (match cs with
     69        [c c. ->
     70         (match k+v with
     71          [key val ->
     72           (let* ([c1. (fl+ c. (fl* IM.0 val))]
     73                  [c1 (inexact->exact (flceiling c1.))]
     74                  #;[b (char->integer key)])
     75             (for ([i (in-range c c1)]) (string-set! v i key))
     76             (tup c1 c1.))])]))
     77     v))
     78 
     79 (define (n-randoms [buf : String][out : String-Port][lookup : String]
     80                    [to : Int][R : Int] -> Int)
     81   (let loop : Int ([n 0] [R R])
     82     (if (< n to)
     83         (let ([R (random-next R)])
     84           (string-set! buf n (string-ref lookup R))
     85           (loop (add1 n) R))
     86         (begin (write-string buf out 0 (add1 to)) R))))
     87 
     88 (define LF #\newline)
     89 
     90 (define (make-line! [buf : String] [lookup : String]
     91                     [start : Int] [R : Int] -> Int)
     92   (let ([end (+ start line-length)])
     93     (string-set! buf end LF)
     94     (let loop : Int ([n start] [R R])
     95       (if (< n end)
     96           (let ([R (random-next R)])
     97             (string-set! buf n (string-ref lookup R))
     98             (loop (add1 n) R))
     99           R))))
    100 
    101 (define (random-fasta [header : String] [N : Int]
    102                       [table : (Hash Char Float)] [R : Int] 
    103                       -> (× Int String))
    104   (let* ([buf (make-string (add1 line-length))]
    105          [out (open-output-string)]
    106          [lookup-str (make-lookup-table table)]
    107          [full-lines+last (quotient+remainder N line-length)]
    108          [C
    109           (let* ([len+1 (add1 line-length)]
    110                  [buflen (* len+1 IM)]
    111                  [buf2 (make-string buflen)])
    112             (let loop : String ([R R] [i 0])
    113               (if (< i buflen)
    114                   (loop (make-line! buf2 lookup-str i R) (+ i len+1))
    115                   buf2)))])
    116   (string-set! buf line-length LF)
    117   (write-string header out)
    118   (tup
    119   (match full-lines+last with
    120    [full-lines last ->
    121    (let loop : Int ([i full-lines] [R R])
    122      (if (> i IM)
    123          (begin (write-string C out) (loop (- i IM) R))
    124          (let loop : Int ([i i] [R R])
    125               (cond 
    126                [(> i 0) 
    127                 (loop 
    128                  (sub1 i) 
    129                  (n-randoms buf out lookup-str line-length R))]
    130                [(> last 0) 
    131                 (string-set! buf last LF) 
    132                 (n-randoms buf out lookup-str last R)]
    133                [else R]))))])
    134   (get-output-string out))))
    135 
    136 (define n 10)
    137 
    138 (check-type (repeat-fasta ">ONE Homo sapiens alu\n" (* n 2) +alu+) : String
    139  -> ">ONE Homo sapiens alu\nGGCCGGGCGCGGTGGCTCAC\n")
    140 
    141 (define res1
    142   (random-fasta ">TWO IUB ambiguity codes\n" (* n 3) IUB 42))
    143 
    144 (define res2
    145   (match res1 with
    146    [R str ->   
    147     (random-fasta ">THREE Homo sapiens frequency\n" (* n 5) HOMOSAPIEN R)]))
    148 
    149 (check-type (proj res1 1) : String
    150  -> ">TWO IUB ambiguity codes\nattRtBtaDtatVataKatgaatcccgDtY\n")
    151 
    152 (check-type (proj res2 1) : String
    153  -> (string-append ">THREE Homo sapiens frequency\n"
    154       "atttgcggaaacgacaaatattaacacatcatcagagtaccataaaggga\n"))
    155 
    156 (define (mk-fasta [n : Int] -> String)
    157   (let 
    158    ([res1 (repeat-fasta ">ONE Homo sapiens alu\n" (* n 2) +alu+)]
    159     [res2 (random-fasta ">TWO IUB ambiguity codes\n" (* n 3) IUB 42)]
    160     [res3
    161      (match res2 with
    162       [R str ->   
    163        (random-fasta ">THREE Homo sapiens frequency\n" (* n 5) HOMOSAPIEN R)])])
    164    (string-append res1 (proj res2 1) (proj res3 1))))
    165 
    166 (provide mk-fasta)
    167 
    168 (check-type (mk-fasta 100) 
    169   : String
    170   -> (string-append  
    171       ">ONE Homo sapiens alu\n"
    172       "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGA\n"
    173       "TCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACT\n"
    174       "AAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAG\n"
    175       "GCTGAGGCAGGAGAATCGCT\n"
    176       ">TWO IUB ambiguity codes\n"
    177       "attRtBtaDtatVataKatgaatcccgDtYtcccNaNgtRttNtatttatcctSaRataW\n"
    178       "taatNtNctaatctttggMtMttKtYtMHagBttagcMKMttttcWaactgcSttgaaac\n"
    179       "gtcatHagHtgtaHVgtcattatgRcaVcaatctcWgaNtttVaaYcaaaataYtgWgtt\n"
    180       "acttMgtHHgagtattaaaKSgtBgacaaggSaaRttVaVDHttRgctagtaaacgaaac\n"
    181       "ttcRNtgcatttSagBtHttNRaatgtctattcaSaRYcgtatSattttttttgaBgagD\n"
    182       ">THREE Homo sapiens frequency\n"
    183       "gaagacaggtgtaacgtgggaaaatctctagtaaagctttgatcagcggagacgcgatca\n"
    184       "acagatcctttatatcgcgaaacttctctctatcagcgaactaaggagggcgacaatccg\n"
    185       "agctgttccggaccaaaccctgaaagtacgactctgctctaataaagtcaaaacgtagaa\n"
    186       "gactagatacaattatactgacaacaaaaaaaagttgcgtgcacaagagtacgatgtttg\n"
    187       "accgccagttattatgacgagggtgagaacaagtcaggctaaagtagaagagcaccatag\n"
    188       "gtatcagtttaactgagtaaatgcgaatgcgtgactttaaataagcctgcgtgtgtcaaa\n"
    189       "actctacaatatctttgttatattattgaatcattctggatttgaggcagtggagcatac\n"
    190       "tgtataaaataatttttcggtgggtcaaaaataaatttcaattaagacgttaaggataat\n"
    191       "gaaatgactcaatctaaggt\n"))