Add some initial code · notjack.space/snoop@66f7a87

+397

2 changed files

expand all

markov.rkt

+212

markov.rkt

··· 1 + #lang racket/base 2 + 3 + 4 + (require math/distributions 5 + racket/match 6 + racket/stream 7 + racket/vector 8 + rebellion/streaming/transducer 9 + rebellion/collection/list) 10 + 11 + 12 + (struct transition-outcome (new-state reward) #:transparent) 13 + (struct markov-process (transition-function states action-space rewards)) 14 + 15 + 16 + (define (markov-process-outcomes process state action) 17 + ((markov-process-transition-function process) state action)) 18 + 19 + 20 + (define (markov-process-perform-action process state action) 21 + (sample (markov-process-outcomes process state action))) 22 + 23 + 24 + ; Gridworld, from Reinforcement Learning: An Introduction Chapter 4.1: 25 + ; 26 + ; T 1 2 3 27 + ; 4 5 6 7 28 + ; 8 9 10 11 29 + ; 12 13 14 T 30 + 31 + 32 + (define gridworld-states (vector-immutable 1 2 3 4 5 6 7 8 9 10 11 12 13 14 'terminal)) 33 + (define gridworld-actions (vector-immutable 'left 'up 'right 'down)) 34 + (define gridworld-rewards (vector-immutable -1 1)) 35 + 36 + 37 + (define gridworld 38 + (markov-process 39 + (λ (state action) 40 + (define new-state 41 + (match* (state action) 42 + [(1 'left) 'terminal] 43 + [(1 'up) 1] 44 + [(1 'right) 2] 45 + [(1 'down) 5] 46 + [(2 'left) 1] 47 + [(2 'up) 2] 48 + [(2 'right) 3] 49 + [(2 'down) 6] 50 + [(3 'left) 2] 51 + [(3 'up) 3] 52 + [(3 'right) 3] 53 + [(3 'down) 7] 54 + [(4 'left) 4] 55 + [(4 'up) 'terminal] 56 + [(4 'right) 5] 57 + [(4 'down) 8] 58 + [(5 'left) 4] 59 + [(5 'up) 1] 60 + [(5 'right) 6] 61 + [(5 'down) 9] 62 + [(6 'left) 5] 63 + [(6 'up) 2] 64 + [(6 'right) 7] 65 + [(6 'down) 10] 66 + [(7 'left) 6] 67 + [(7 'up) 3] 68 + [(7 'right) 7] 69 + [(7 'down) 11] 70 + [(8 'left) 8] 71 + [(8 'up) 4] 72 + [(8 'right) 9] 73 + [(8 'down) 12] 74 + [(9 'left) 8] 75 + [(9 'up) 5] 76 + [(9 'right) 10] 77 + [(9 'down) 13] 78 + [(10 'left) 9] 79 + [(10 'up) 6] 80 + [(10 'right) 11] 81 + [(10 'down) 14] 82 + [(11 'left) 10] 83 + [(11 'up) 7] 84 + [(11 'right) 11] 85 + [(11 'down) 'terminal] 86 + [(12 'left) 12] 87 + [(12 'up) 8] 88 + [(12 'right) 13] 89 + [(12 'down) 12] 90 + [(13 'left) 12] 91 + [(13 'up) 9] 92 + [(13 'right) 14] 93 + [(13 'down) 13] 94 + [(14 'left) 13] 95 + [(14 'up) 10] 96 + [(14 'right) 'terminal] 97 + [(14 'down) 14] 98 + [('terminal _) 'terminal])) 99 + (define reward (if (equal? new-state 'terminal) 1 -1)) 100 + (discrete-dist (list (transition-outcome new-state reward)))) 101 + gridworld-states 102 + (λ (_) gridworld-actions) 103 + gridworld-rewards)) 104 + 105 + 106 + (struct markov-policy (process choice-function)) 107 + 108 + 109 + (define (random-policy process) 110 + (markov-policy process (λ (_ actions) (discrete-dist actions)))) 111 + 112 + 113 + (define (greedy-policy process state-value-estimator #:discount-rate [discount-rate #e0.95]) 114 + 115 + (define (estimate-action-value state action) 116 + (define outcomes (markov-process-outcomes process state action)) 117 + (for*/sum ([s* (in-vector (markov-process-states process))] 118 + [r (in-vector (markov-process-rewards process))]) 119 + (* (pdf outcomes (transition-outcome s* r)) 120 + (+ r (* discount-rate (state-value-estimator s*)))))) 121 + 122 + (markov-policy 123 + process 124 + (λ (state actions) 125 + (define optimal-actions 126 + (transduce actions 127 + (taking-maxima #:key (λ (a) (estimate-action-value state a))) 128 + #:into into-list)) 129 + (discrete-dist optimal-actions)))) 130 + 131 + 132 + (define (markov-policy-actions policy state) 133 + (define process (markov-policy-process policy)) 134 + (define actions ((markov-process-action-space process) state)) 135 + ((markov-policy-choice-function policy) state actions)) 136 + 137 + 138 + (define (markov-policy-choose-action policy state) 139 + (sample (markov-policy-actions policy state))) 140 + 141 + 142 + (struct transition-step (action new-state reward) #:transparent) 143 + 144 + 145 + (define (markov-policy-next-step policy state) 146 + (define process (markov-policy-process policy)) 147 + (define action (markov-policy-choose-action policy state)) 148 + (match-define (transition-outcome new-state reward) 149 + (markov-process-perform-action process state action)) 150 + (transition-step action new-state reward)) 151 + 152 + 153 + (define (markov-policy-rollout policy init-state) 154 + (define step (markov-policy-next-step policy init-state)) 155 + (define new-state (transition-step-new-state step)) 156 + (stream-cons step (markov-policy-rollout policy new-state))) 157 + 158 + 159 + (define (markov-policy-table policy) 160 + (for/hash ([s (in-vector (markov-process-states (markov-policy-process policy)))]) 161 + (values s (markov-policy-actions policy s)))) 162 + 163 + 164 + (define (markov-policy-iterative-evaluation policy #:discount-rate [discount-rate #e0.95]) 165 + (define process (markov-policy-process policy)) 166 + (define states (markov-process-states process)) 167 + (define rewards (markov-process-rewards process)) 168 + (define action-space (markov-process-action-space process)) 169 + (define value-estimates (make-vector (vector-length states) 0)) 170 + (define state-indices 171 + (for/hash ([s (in-vector states)] 172 + [i (in-naturals)]) 173 + (values s i))) 174 + (define (estimate-state-value s) 175 + (vector-ref value-estimates (hash-ref state-indices s))) 176 + (let loop () 177 + (define any-changed? 178 + (for/fold ([any-changed? #false]) 179 + ([v (in-vector value-estimates)] 180 + [s (in-vector states)] 181 + [i (in-naturals)]) 182 + (define policy-actions (markov-policy-actions policy s)) 183 + (define v* 184 + (for/sum ([a (in-vector (action-space s))]) 185 + (define outcomes (markov-process-outcomes process s a)) 186 + (for*/sum ([s* (in-vector states)] 187 + [r (in-vector rewards)]) 188 + (define s*-v (estimate-state-value s*)) 189 + (* (pdf policy-actions a) 190 + (pdf outcomes (transition-outcome s* r)) 191 + (+ r (* discount-rate s*-v)))))) 192 + (vector-set! value-estimates i v*) 193 + (define delta (abs (- v v*))) 194 + (or any-changed? (> delta #e0.1)))) 195 + (when any-changed? 196 + (loop))) 197 + (for/hash ([s (in-vector states)] 198 + [v (in-vector value-estimates)]) 199 + (values s v))) 200 + 201 + 202 + (define value-estimates (markov-policy-iterative-evaluation (random-policy gridworld))) 203 + (define policy (greedy-policy gridworld (λ (s) (hash-ref value-estimates s)))) 204 + 205 + (for ([(s actions) (in-hash (markov-policy-table policy))]) 206 + (printf "state ~a: ~a\n" s actions)) 207 + 208 + #;(module+ main 209 + (printf "Starting in state ~a\n" 9) 210 + (for ([step (in-stream (stream-take (markov-policy-rollout policy 9) 10))]) 211 + (match-define (transition-step action new-state reward) step) 212 + (printf "Reached state ~a via action ~a, received reward ~a\n" new-state action reward)))

+185

transition-system.rkt

··· 1 + #lang racket/base 2 + 3 + 4 + (require racket/contract/base) 5 + 6 + 7 + (provide 8 + (contract-out 9 + [transition-system? (-> any/c boolean?)] 10 + [make-transition-system 11 + (->* (#:states (sequence/c any/c) 12 + #:actions (sequence/c any/c) 13 + #:transition-function (-> any/c any/c distribution?)) 14 + (#:applicable-action-selector (or/c (-> any/c set?) #false) 15 + #:name (or/c symbol? #false)) 16 + transition-system?)] 17 + [transition-system-states (-> transition-system? set?)] 18 + [transition-system-actions (-> transition-system? set?)] 19 + [transition-system-contains-state? (-> transition-system? any/c boolean?)] 20 + [transition-system-contains-action? (-> transition-system? any/c boolean?)] 21 + [transition-system-action-applicable? (-> transition-system? any/c any/c boolean?)] 22 + [transition-system-applicable-actions (-> transition-system? any/c set?)] 23 + [transition-system-outcomes (-> transition-system? any/c any/c distribution?)] 24 + [transition-system-perform-action (-> transition-system? any/c any/c any/c)])) 25 + 26 + 27 + (require math/distributions 28 + racket/match 29 + racket/sequence 30 + racket/set 31 + rebellion/custom-write) 32 + 33 + 34 + (struct transition-system (states actions transition-function applicable-action-selector name) 35 + #:omit-define-syntaxes 36 + #:constructor-name constructor:transition-system 37 + #:property prop:object-name (struct-field-index name) 38 + #:property prop:custom-write (make-named-object-custom-write 'transition-system)) 39 + 40 + 41 + (define (make-transition-system #:states states 42 + #:actions actions 43 + #:transition-function transition-function 44 + #:applicable-action-selector [selector #false] 45 + #:name [name #false]) 46 + (let* ([states (for/set ([s states]) s)] 47 + [actions (for/set ([a actions]) a)] 48 + [selector (or selector (λ (_) actions))]) 49 + (constructor:transition-system states actions transition-function selector name))) 50 + 51 + 52 + ; Gridworld, from Reinforcement Learning: An Introduction Chapter 4.1: 53 + ; 54 + ; T 1 2 3 55 + ; 4 5 6 7 56 + ; 8 9 10 11 57 + ; 12 13 14 T 58 + 59 + (define gridworld-states (set 'terminal 1 2 3 4 5 6 7 8 9 10 11 12 13 14)) 60 + (define gridworld-actions (set 'left 'up 'right 'down)) 61 + 62 + 63 + (define (gridworld-transition state action) 64 + (define new-state 65 + (match* (state action) 66 + [(1 'left) 'terminal] 67 + [(1 'up) 1] 68 + [(1 'right) 2] 69 + [(1 'down) 5] 70 + [(2 'left) 1] 71 + [(2 'up) 2] 72 + [(2 'right) 3] 73 + [(2 'down) 6] 74 + [(3 'left) 2] 75 + [(3 'up) 3] 76 + [(3 'right) 3] 77 + [(3 'down) 7] 78 + [(4 'left) 4] 79 + [(4 'up) 'terminal] 80 + [(4 'right) 5] 81 + [(4 'down) 8] 82 + [(5 'left) 4] 83 + [(5 'up) 1] 84 + [(5 'right) 6] 85 + [(5 'down) 9] 86 + [(6 'left) 5] 87 + [(6 'up) 2] 88 + [(6 'right) 7] 89 + [(6 'down) 10] 90 + [(7 'left) 6] 91 + [(7 'up) 3] 92 + [(7 'right) 7] 93 + [(7 'down) 11] 94 + [(8 'left) 8] 95 + [(8 'up) 4] 96 + [(8 'right) 9] 97 + [(8 'down) 12] 98 + [(9 'left) 8] 99 + [(9 'up) 5] 100 + [(9 'right) 10] 101 + [(9 'down) 13] 102 + [(10 'left) 9] 103 + [(10 'up) 6] 104 + [(10 'right) 11] 105 + [(10 'down) 14] 106 + [(11 'left) 10] 107 + [(11 'up) 7] 108 + [(11 'right) 11] 109 + [(11 'down) 'terminal] 110 + [(12 'left) 12] 111 + [(12 'up) 8] 112 + [(12 'right) 13] 113 + [(12 'down) 12] 114 + [(13 'left) 12] 115 + [(13 'up) 9] 116 + [(13 'right) 14] 117 + [(13 'down) 13] 118 + [(14 'left) 13] 119 + [(14 'up) 10] 120 + [(14 'right) 'terminal] 121 + [(14 'down) 14] 122 + [('terminal _) 'terminal])) 123 + (discrete-dist (list new-state))) 124 + 125 + 126 + (define gridworld 127 + (make-transition-system 128 + #:states gridworld-states 129 + #:actions gridworld-actions 130 + #:transition-function gridworld-transition 131 + #:name 'gridworld)) 132 + 133 + 134 + (define (transition-system-contains-state? system state) 135 + (set-member? (transition-system-states system) state)) 136 + 137 + 138 + (define (transition-system-contains-action? system action) 139 + (set-member? (transition-system-actions system) action)) 140 + 141 + 142 + (define (transition-system-action-applicable? system state action) 143 + (set-member? (transition-system-applicable-actions system state) action)) 144 + 145 + 146 + (define (transition-system-applicable-actions system state) 147 + (check-transition-system-contains-state #:who 'transition-system-applicable-actions system state) 148 + ((transition-system-applicable-action-selector system) state)) 149 + 150 + 151 + (define (transition-system-outcomes system state action) 152 + (check-transition-system-contains-state #:who 'transition-system-outcomes system state) 153 + (check-transition-system-contains-action #:who 'transition-system-outcomes system action) 154 + (check-transition-system-action-applicable #:who 'transition-system-outcomes system state action) 155 + ((transition-system-transition-function system) state action)) 156 + 157 + 158 + (define (transition-system-perform-action system state action) 159 + (check-transition-system-contains-state #:who 'transition-system-perform-action system state) 160 + (check-transition-system-contains-action #:who 'transition-system-perform-action system action) 161 + (check-transition-system-action-applicable #:who 'transition-system-perform-action 162 + system state action) 163 + (sample ((transition-system-transition-function system) state action))) 164 + 165 + 166 + (define (check-transition-system-contains-state #:who who system state) 167 + (unless (transition-system-contains-state? system state) 168 + (raise-arguments-error who "transition system does not contain given state" 169 + "state" state 170 + "system" system))) 171 + 172 + 173 + (define (check-transition-system-contains-action #:who who system action) 174 + (unless (transition-system-contains-action? system action) 175 + (raise-arguments-error who "transition system does not contain given action" 176 + "action" action 177 + "system" system))) 178 + 179 + 180 + (define (check-transition-system-action-applicable #:who who system state action) 181 + (unless (transition-system-action-applicable? system state action) 182 + (raise-arguments-error who "action is not applicable in given state" 183 + "action" action 184 + "state" state 185 + "system" system)))

Configure Feed

Configure Feed