Reinforcement Learning example

No preview image

1 collaborator

Russ Abbott (Author)

WHAT IS IT?

(a general understanding of what the model is trying to show or explain)

HOW IT WORKS

(what rules the agents use to create the overall behavior of the model)

HOW TO USE IT

(how to use the model, including a description of each of the items in the Interface tab)

THINGS TO NOTICE

(suggested things for the user to notice while running the model)

THINGS TO TRY

(suggested things for the user to try to do (move sliders, switches, etc.) with the model)

EXTENDING THE MODEL

(suggested things to add or change in the Code tab to make the model more complicated, detailed, accurate, etc.)

NETLOGO FEATURES

(interesting or unusual features of NetLogo that the model uses, particularly in the Code tab; or where workarounds were needed for missing features)

RELATED MODELS

(models in the NetLogo Models Library and elsewhere which are of related interest)

CREDITS AND REFERENCES

(a reference to the model's URL on the web if it has one, as well as any other necessary credits, citations, and links)

Comments and Questions

Please start the discussion about this model! (You'll first need to log in.)

Click to Run Model

;; Adapted from a model by Joe Roop: http://ccl.northwestern.edu/netlogo/models/community/Reinforcement Learning Maze
;; Copyright Russ Abbott (Russ.Abbott@gmail.com)
;; This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License.
;; To view a copy of the license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.

;; Per-patch state used by the Q-learning walker.
patches-own [
  Qlist     ;; list of four Q-values, one per direction, in the same order as Hlist; starts as [0 0 0 0]
  reward    ;; reward received for stepping onto this patch (assigned in set-rewards)
  qa-elts   ;; list of the four zero-size turtles whose labels display this patch's Q-values
]

;; The learning agent(s); setup creates exactly one and episode-path drives it through the maze.
breed [walkers walker]

;; NOTE(review): this breed is declared but never used in the visible code --
;; the Q-value label turtles are created with plain `sprout`, not `sprout-qa-labels`.
breed [qa-labels qa-label]

;; Model-wide state. Other names used by the code (e.g. trace?, weight, gamma)
;; are not declared here -- presumably Interface widgets; confirm in the Interface tab.
globals [
  episode      ;; number of the episode about to run; set to 1 by setup, incremented by one-trip
  goal-color   ;; pcolor that marks the goal patch
  goal-patch   ;; the patch the walker is trying to reach
  start-patch  ;; the patch every episode starts from
  Hlist        ;; the four headings (west north east south); positions match those in each patch's Qlist
  north        ;; heading 0
  east         ;; heading 90
  south        ;; heading 180
  west         ;; heading 270
]

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

to setup
  ;; Reset the world, define the compass headings, build the maze and
  ;; place a single walker on the start patch, ready for episode 1.
  clear-all

  ;; Compass headings in degrees.
  set north 0
  set east 90
  set south 180
  set west 270
  ;; Action order; each patch's Qlist is indexed in parallel with this list.
  set Hlist (list west north east south)

  setup-maze

  create-walkers 1 [
    set shape "bug"
    set size 0.8
    set color red + 1
    move-to start-patch
    set heading 45
  ]

  set episode 1
  set-current-plot "Ave Reward Per Episode"
end

to setup-maze
  ;; Give every patch a zeroed Q-table and four invisible (size 0) turtles
  ;; whose labels display the Q-values, then lay out the maze itself.
  ask patches [
    ;; (heading distance) pairs, in Hlist order: west, north, east, south.
    let placements (list (list west 0.25) (list north 0.35) (list east 0.3) (list south 0.40))
    let next-placement 0
    set Qlist [0 0 0 0]
    set qa-elts []
    sprout 4 [
      set size 0
      ;; Nudge slightly right and down from the patch centre before fanning out.
      set heading east
      fd 0.1
      set heading south
      fd 0.1
      ;; Each new turtle takes the next unused placement, so the order in
      ;; which turtles join qa-elts matches the order of Hlist.
      let placement item next-placement placements
      set next-placement next-placement + 1
      set heading first placement
      fd last placement
      set label 0
      set qa-elts lput self qa-elts
    ]
  ]
  set-maze-elements
end

to set-maze-elements
  ;; Paint the checkerboard floor, mark the start and goal patches, add the
  ;; walls and the passage through the middle wall, assign rewards, and
  ;; finally hide the Q-value labels that sit on wall or goal patches.
  clear-drawing
  ask patches [ set pcolor default-color self ]

  set goal-color orange + 3
  set start-patch patch -4 -4
  set goal-patch patch 4 4
  ask start-patch [ set pcolor black ]
  ask goal-patch [ set pcolor goal-color ]

  setup-blockades
  make-passage
  set-rewards

  ask patches [
    foreach qa-elts [ marker ->
      ;; pcolor is read in the label turtle's own context, i.e. from the patch it stands on.
      ask marker [ set hidden? member? pcolor (list blue goal-color) ]
    ]
  ]
end

to setup-blockades
  ;; Colour the walls blue:
  ;;   * the outer border of the world,
  ;;   * the full horizontal row pycor = -1 (make-passage later opens a gap in it),
  ;;   * three fixed obstacle patches, each extended by randomly chosen neighbours.
  ;; The border is found with min-/max-pxcor and min-/max-pycor so that it lies on
  ;; the real world edge even when the origin is not at the centre of the world.
  ask patches with [
    pxcor = min-pxcor or pxcor = max-pxcor or
    pycor = min-pycor or pycor = max-pycor or
    pycor = -1
  ] [
    set pcolor blue
  ]
  foreach (list patch -1 2 patch 1 1 patch -2 -3) [ p ->
    ask p [
      set pcolor blue
      ;; The obstacle at (-2, -3) grows by one random neighbour, the other two by two.
      ask n-of (ifelse-value (p = patch -2 -3) [1] [2]) neighbors [ set pcolor blue ]
    ]
  ]
end

to make-passage
  ;; Open a gap in the horizontal wall at pycor = -1.
  ;; A random wall patch more than three columns away from both the left and the
  ;; right world edge is restored to floor colour, together with the patches
  ;; directly above and below it and one of the two patches diagonally above it.
  ;; The left margin is measured from min-pxcor, so it stays correct when the
  ;; origin is not centred.
  ask one-of patches with [pycor = -1 and pxcor > min-pxcor + 3 and pxcor < max-pxcor - 3] [
    let gap-x pxcor
    let gap-y pycor
    set pcolor default-color self
    ;; Directly above and below the gap.
    ask patches with [pxcor = gap-x and (pycor = gap-y + 1 or pycor = gap-y - 1)] [
      set pcolor default-color self
    ]
    ;; One of the two patches diagonally above the gap.
    ask one-of patches with [pycor = gap-y + 1 and (pxcor = gap-x + 1 or pxcor = gap-x - 1)] [
      set pcolor default-color self
    ]
  ]
end

to set-rewards
  ;; Assign each patch its reward (wall, goal or ordinary floor) and sprout a
  ;; zero-size turtle on every patch whose label shows that reward.
  ask patches [
    ifelse pcolor = blue
      [ set reward boundary-reward ]
      [ set reward base-reward ]
  ]
  ask goal-patch [ set reward goal-reward ]

  ask patches [
    sprout 1 [
      set size 0
      set heading east
      fd 0.2
      set label [reward] of myself
      set label-color ifelse-value (myself = goal-patch) [black] [yellow + 2]
    ]
    ;; Shift the west (item 0) and east (item 2) Q-value labels down by 0.1.
    foreach (list (item 0 qa-elts) (item 2 qa-elts)) [ side-label ->
      ask side-label [
        set heading south
        fd 0.1
      ]
    ]
  ]
end

to-report default-color [a-patch]
  ;; Reports the checkerboard floor colour for a-patch: a dark green that is
  ;; 0.2 lighter when pxcor + pycor is odd and 0.2 darker otherwise.
  let coord-sum [pxcor + pycor] of a-patch
  let shade ifelse-value (is-odd? coord-sum) [0.2] [-0.2]
  report green - 2 + shade
end

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

to go
  ;; Run one learning episode (with tracing switched off) until
  ;; num-episodes episodes have been completed.
  if episode > num-episodes [
    stop
  ]
  set trace? false
  one-trip
end

to one-trip
  ;; Run a single episode, then plot and print its average reward and
  ;; advance the episode counter.
  clear-drawing
  ;; No exploration while tracing a path; otherwise convert the percentage to a fraction.
  let explore-fraction 0
  if not trace? [ set explore-fraction exploration-% / 100 ]
  let rewards episode-path explore-fraction
  let steps length rewards
  let avg-reward (sum rewards) / steps
  plot avg-reward
  output-print (word " " episode "; path-length: " steps "; avg-reward: " precision avg-reward 2)
  set episode episode + 1
end

to-report episode-path [explore-%]
  ;; Walk each walker from the start patch until it steps onto a patch that is
  ;; neither floor-coloured nor the start patch, updating Q-values on the way.
  ;; explore-% is the probability (0..1) of choosing a random direction instead
  ;; of a best-valued one. Reports the list of rewards collected, one per step.
  let rewards-seen []
  ask walkers [
    pen-up
    move-to start-patch
    if trace? [
      pen-down
      set pen-size 3
    ]
    while [patch-here = start-patch or [pcolor] of patch-here = default-color self] [
      ;; Choose an action: its index into Hlist / Qlist, and the matching heading.
      let action-index 0
      ifelse random-float 1 < explore-% [
        ;; Explore: pick any direction.
        set heading one-of Hlist
        set action-index position heading Hlist
      ] [
        ;; Exploit: pick at random among the directions that share the highest Q-value.
        set action-index one-of all-positions (max Qlist) Qlist
        set heading item action-index Hlist
      ]

      ;; Reward for the patch about to be entered.
      let step-reward [reward] of patch-ahead 1
      set rewards-seen lput step-reward rewards-seen

      ;; Q-learning update for (current patch, chosen action).
      let best-next max [Qlist] of patch-ahead 1
      let old-q item action-index Qlist
      let new-q precision ((1 - weight) * old-q + weight * (step-reward + gamma * best-next)) 3
      set Qlist replace-item action-index Qlist new-q

      ;; Refresh the four Q-value labels shown on the current patch, then step.
      ask patch-here [
        (foreach qa-elts Qlist [ [label-turtle q] ->
          ask label-turtle [ set label precision q 1 ]
        ])
      ]
      fd 1
    ]
  ]
  report rewards-seen
end

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

to-report all-positions [elt a-list]
  ;; Reports every index, in ascending order, at which elt occurs in a-list;
  ;; reports [] when elt does not occur.
  report filter [ i -> item i a-list = elt ] (range length a-list)
end

to-report all-positions' [elt a-list n]
  ;; Reports the positions of every occurrence of elt in a-list, in ascending
  ;; order, numbering the first element of a-list as n.
  let found []
  let idx n
  foreach a-list [ candidate ->
    if candidate = elt [ set found lput idx found ]
    set idx idx + 1
  ]
  report found
end

to-report is-odd? [n]
  ;; True when n leaves a remainder of 1 on division by 2.
  let leftover n mod 2
  report leftover = 1
end

to-report list-to-string [a-list sep]
  ;; Joins the elements of a-list into one string, separated by sep.
  ;; An empty list gives "" (reduce would raise a runtime error on it), and
  ;; every element is converted with word first, so a one-element list also
  ;; yields a string rather than the raw element.
  if empty? a-list [ report "" ]
  let pieces map [ elt -> (word elt) ] a-list
  report reduce [ [so-far next] -> (word so-far sep next) ] pieces
end

to-report second [a-list]
  ;; Reports the element at index 1 (the second element) of a-list.
  report item 1 a-list
end

There is only one version of this model, created over 7 years ago by Russ Abbott.

Attached files

File	Type	Description	Last updated
2018-04-01_21-30-12.png	png	Model image	over 7 years ago, by Russ Abbott	Download

This model does not have any ancestors.

This model does not have any descendants.

NetLogo