
Merge pull request openai#1 from JanMatas/CNN
Enable learning from pixels in the environment
JanMatas authored Mar 20, 2018
2 parents 44ce256 + bf9e251 commit 093babe
Showing 5 changed files with 137 additions and 101 deletions.
23 changes: 18 additions & 5 deletions src/Mico.py
@@ -1,21 +1,24 @@
import math
import sys
import numpy as np

micoUrdf = "../mico_description/urdf/mico.urdf"
micoUrdf = "/Users/janmatas/fyp/mico_description/urdf/mico.urdf"
class Mico:
def __init__(self,p, spawnPos=[0,0,0], enableDebug=False):
self.p = p
self.armId = self.p.loadURDF(micoUrdf,spawnPos, self.p.getQuaternionFromEuler([0,0,0]), flags=self.p.URDF_USE_SELF_COLLISION_EXCLUDE_PARENT)
self.numJoints = self.p.getNumJoints(self.armId)
assert (self.numJoints)

self.jointVelocities = [0] * self.numJoints
self.velocityControls = [0] * self.numJoints

self.goalPosition = [0,0,1]
self.goalPosition = [-0.5, -0, 0.2]
self.goalOrientation = [0,-math.pi,-math.pi/2]
self.goalGripper = 0
self.goalReached = False
self.goalEpsilon = 0.1

self.truncated = False
self.graspableObject = None
self.ikEnabled = True # If true, the arm will compute IK to goal position

@@ -35,28 +38,38 @@ def __init__(self,p, spawnPos=[0,0,0], enableDebug=False):
def computeIKInformation(self):
jointInformation = list(map(lambda i: self.p.getJointInfo(self.armId, i), range(self.numJoints)))
self.IKInfo = {}
assert all([len(jointInformation[i]) == 17 for i in range(self.numJoints)] )
self.IKInfo["lowerLimits"] = list(map(lambda i: jointInformation[i][8], range(8)))
self.IKInfo["upperLimits"] = list(map(lambda i: jointInformation[i][9], range(8)))
self.IKInfo["jointRanges"] = list(map(lambda i: jointInformation[i][9] - jointInformation[i][8], range(8)))
self.IKInfo["restPoses"] = [math.pi, math.pi * (1.7/3.0), 3.141592653589, 0.0, 0.0, 0.0, 0.0, 1.0]

self.IKInfo["restPoses"] = (3.1634643356751333, 1.8814690809007582, 2.9530639629442246, 1.3277818360585532, -2.8950796861071364, 2.9204677700686736, 1.3980617813922944, 1.3966983777123398)

self.IKInfo["solver"] = 0
self.IKInfo["jointDamping"] = [0.001] * self.numJoints
self.IKInfo["endEffectorLinkIndex"] = 6


def truncateVal(self, val, lowerBound, upperBound):
return min(max(val, lowerBound), upperBound)
new_val = min(max(val, lowerBound), upperBound)
self.truncated = val != new_val
return new_val

# Use either applyAction or setGoal, not both
def applyAction(self, action):

dx = action[0]
dy = action[1]
dz = action[2]
da = action[3]
grip_state = self.p.getLinkState(self.armId, 6, computeLinkVelocity=1)
grip_pos = np.array(grip_state[0])

self.goalPosition[0] = self.truncateVal(self.goalPosition[0] + dx, -0.5, -0.2)
self.goalPosition[1] = self.truncateVal(self.goalPosition[1] + dy, -0.3, 0.3)
self.goalPosition[2] = self.truncateVal(self.goalPosition[2] + dz, 0.05, 0.6)


self.goalGripper = self.truncateVal(self.goalGripper + da, 0, 1.4)


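The new applyAction treats an action as four small deltas (dx, dy, dz, da) and clamps the updated goal into a fixed workspace box through truncateVal before IK is computed. A minimal sketch of driving that interface directly, assuming the hardcoded micoUrdf path resolves on your machine and that src/ is importable; both are assumptions, not something this diff guarantees:

import pybullet as p
from Mico import Mico  # assumes src/ is on sys.path

p.connect(p.DIRECT)  # headless physics client
arm = Mico(p)        # loads the arm URDF at the origin

# Small Cartesian deltas plus a gripper delta; the goal is clamped into
# x in [-0.5, -0.2], y in [-0.3, 0.3], z in [0.05, 0.6], gripper in [0, 1.4].
arm.applyAction([0.0, 0.01, 0.0, 0.1])

# A delta that would leave the box saturates at the nearest bound.
arm.applyAction([1.0, 0.0, 0.0, 0.0])
print(arm.goalPosition)  # x component is clamped to -0.2

Note that self.truncated is overwritten by each successive truncateVal call, so after applyAction it reflects only the final (gripper) clamp.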
25 changes: 7 additions & 18 deletions src/example.py
@@ -6,7 +6,7 @@
obs = env.reset()
done = False

def policy(observation, desired_goal):
def policy_push(observation, desired_goal):
# Here you would implement your smarter policy. In this case,
# we just sample random actions.
grip_pos = observation[:3]
@@ -16,30 +16,19 @@ def policy(observation, desired_goal):
return (0,0,0,0)
elif grip_pos[2] > 0.13 or grip_pos[0]>object_pos[0]+0.03:
object_pos[1] += 0.1
return np.concatenate((object_pos - grip_pos, np.array([0.1]))) * 0.05
return np.concatenate((object_pos - grip_pos, np.array([0.1])))
else:
return (0,-0.05,0,0.1)
return data

def policy2(observation, desired_goal):
def policy_reach(observation):
# Here you would implement your smarter policy. In this case,
# we just sample random actions.
grip_pos = observation[:3]
print (grip_pos)
object_pos = [-0.2,-0.0,0]

return np.concatenate((object_pos - grip_pos, np.array([0.1]))) * 0.05
goal = observation[3:6]
return np.concatenate((goal - grip_pos, np.array([0.1])))

while not done:
action = policy2(obs['observation'], obs['desired_goal'])
action = policy_reach(obs)
obs, reward, done, info = env.step(action)
env.render()

# If we want, we can substitute a goal here and re-compute
# the reward. For instance, we can just pretend that the desired
# goal was what we achieved all along.
substitute_goal = obs['achieved_goal'].copy()
substitute_reward = env.compute_reward(
obs['achieved_goal'], substitute_goal, info)
print('reward is {}, substitute_reward is {}'.format(
reward, substitute_reward))
env.render()
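The new policy_reach is a plain proportional controller toward the goal slice of the flat observation; as the bullet_robot_env.py changes below show, step() then clips every action component into the new Box(-1, 1) action space (previously Box(-0.01, 0.01)). A self-contained sketch of the same arithmetic with made-up observation values:

import numpy as np

def reach_action(obs, gripper_cmd=0.1):
    # Mirrors policy_reach above: obs[:3] is the gripper position, obs[3:6] the goal.
    grip_pos = np.asarray(obs[:3])
    goal = np.asarray(obs[3:6])
    return np.concatenate((goal - grip_pos, [gripper_cmd]))

obs = np.array([-0.5, 0.0, 0.2, -0.3, 0.1, 0.1])  # illustrative low_dim observation
action = reach_action(obs)
print(action)                      # approximately [ 0.2  0.1 -0.1  0.1]
print(np.clip(action, -1.0, 1.0))  # unchanged here; step() applies this clip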
70 changes: 44 additions & 26 deletions src/micoenv/bullet_robot_env.py
@@ -13,7 +13,7 @@
# Pybullet imports
import pybullet as p
import pybullet_data

import time

import sys
sys.path.insert(1, "../bullet3/build_cmake/examples/pybullet")
@@ -25,7 +25,7 @@ def goal_distance(goal_a, goal_b):

timeStep = 1/240.0
class BulletRobotEnv(gym.GoalEnv):
def __init__(self, n_actions, n_substeps, useDone=True):
def __init__(self, n_actions, n_substeps, observation_type="low_dim", useDone=False, doneAfter=float("inf")):
self.viewer = None
self.n_substeps = n_substeps
self.metadata = {
@@ -37,20 +37,26 @@ def __init__(self, n_actions, n_substeps, useDone=True):
self.p = p
self.useDone = useDone
p.setAdditionalSearchPath(pybullet_data.getDataPath())

self.doneAfter = doneAfter
self.observation_type = observation_type
self.seed()
self._env_setup(initial_qpos=None)

self.goal = self._sample_goal()
obs = self._get_obs()
self.action_space = spaces.Box(-0.01, 0.01, shape=(n_actions,), dtype='float32')


self.observation_space = spaces.Dict(dict(
desired_goal=spaces.Box(-np.inf, np.inf, shape=obs['achieved_goal'].shape, dtype='float32'),
achieved_goal=spaces.Box(-np.inf, np.inf, shape=obs['achieved_goal'].shape, dtype='float32'),
observation=spaces.Box(-np.inf, np.inf, shape=obs['observation'].shape, dtype='float32'),
))
self.action_space = spaces.Box(-1, 1, shape=(n_actions,), dtype='float32')
low_dim_space = spaces.Box(-np.inf, np.inf, shape=(6,), dtype='float32')
pixels_space = spaces.Box(-np.inf, np.inf, shape=(200,200,4), dtype='float32')
if observation_type == "low_dim":
self.observation_space = low_dim_space
elif observation_type == "pixels":
self.observation_space = pixels_space
elif observation_type == "combined":
self.observation_space = spaces.Dict(dict(
low_dim=low_dim_space,
pixels=pixels_space,
observation=spaces.Box(-np.inf, np.inf, shape=obs['observation'].shape, dtype='float32')
))
else:
raise Exception("Unimplemented observation_type")

@property
def dt(self):
@@ -64,22 +70,29 @@ def seed(self, seed=None):
return [seed]

def step(self, action):
if self.numSteps == 0:
self.startTime = time.time()
action = np.clip(action, self.action_space.low, self.action_space.high)
self._set_action(action)
for i in range(self.n_substeps):
p.stepSimulation()
self._step_callback()
obs = self._get_obs()
self.numSteps += 1
done = False
info = {
'is_success': self._is_success(obs['achieved_goal'], self.goal),
}
reward = self.compute_reward(obs['achieved_goal'], self.goal, info)
if self.useDone and self.numSteps >= self._max_episode_steps:
return obs, reward, True, info

return obs, reward, done, info
obs = self._get_obs()
if self.observation_type == "her":
assert "achieved_goal" in obs and "desired_goal" in obs and "observation" in obs
info = {
'is_success': self._is_success_her(obs['achieved_goal'], self.goal),
}
reward = self.compute_reward(obs['achieved_goal'], self.goal)
return obs, reward, False, info
else:
reward = self._get_reward()
done = False
if self.useDone:
done = self._is_success() or self.numSteps > self.doneAfter
return obs, reward, done, {}

def reset(self):
# Attempt to reset the simulator. Since we randomize initial conditions, it
@@ -103,12 +116,15 @@ def close(self):

def render(self, mode='human'):
self._render_callback()
width, height = 500, 500
img = p.getCameraImage(width, height)
data = np.array(img[2], dtype=np.float).reshape(width,height,4) / 255
width, height = 200, 200
viewMatrix=p.computeViewMatrix([-0.5,0.5,0.5],[0,0,0],[0,0,1])
projMatrix=p.computeProjectionMatrixFOV(fov=90,aspect=4./3.,nearVal=0.01,farVal=2)
img = p.getCameraImage(width,height,viewMatrix,projMatrix)
# img = p.getCameraImage(width,height)
if mode == 'rgb_array':
return data
return np.array(img[2], dtype=np.float).reshape(width,height,4) / 255
elif mode == 'human':
data = np.array(img[2], dtype=np.float).reshape(width,height,4) / 255
cv2.imshow("test", data)
cv2.waitKey(1)
self._render_cleanup()
@@ -131,6 +147,8 @@ def compute_reward(self, achieved_goal, goal, info):



def timePerStep(self):
return (time.time() - self.startTime) / self.numSteps
# Extension methods
# ----------------------------

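The new pixels observation space advertises a 200x200 RGBA image, and render() now builds it from a fixed camera via computeViewMatrix and computeProjectionMatrixFOV. A standalone sketch of that camera pipeline, runnable outside the environment; the plane.urdf scene is only a placeholder so the image is not empty:

import numpy as np
import pybullet as p
import pybullet_data

p.connect(p.DIRECT)  # headless client; getCameraImage falls back to the software renderer
p.setAdditionalSearchPath(pybullet_data.getDataPath())
p.loadURDF("plane.urdf")

width, height = 200, 200
view = p.computeViewMatrix([-0.5, 0.5, 0.5], [0, 0, 0], [0, 0, 1])
proj = p.computeProjectionMatrixFOV(fov=90, aspect=4.0 / 3.0, nearVal=0.01, farVal=2)
img = p.getCameraImage(width, height, view, proj)

rgba = np.array(img[2], dtype=np.float64).reshape(height, width, 4) / 255.0
print(rgba.shape)  # (200, 200, 4), matching the pixels observation space above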
(The remaining two changed files are not shown here.)
