
Merge pull request openai#1 from JanMatas/CNN
Enable learning from pixels in the environment
JanMatas authored Mar 20, 2018
2 parents 44ce256 + bf9e251 commit 093babe
Showing 5 changed files with 137 additions and 101 deletions.
23 changes: 18 additions & 5 deletions src/Mico.py
@@ -1,21 +1,24 @@
import math
import sys
import numpy as np

micoUrdf = "../mico_description/urdf/mico.urdf"
micoUrdf = "/Users/janmatas/fyp/mico_description/urdf/mico.urdf"
class Mico:
def __init__(self,p, spawnPos=[0,0,0], enableDebug=False):
self.p = p
self.armId = self.p.loadURDF(micoUrdf,spawnPos, self.p.getQuaternionFromEuler([0,0,0]), flags=self.p.URDF_USE_SELF_COLLISION_EXCLUDE_PARENT)
self.numJoints = self.p.getNumJoints(self.armId)
assert (self.numJoints)

self.jointVelocities = [0] * self.numJoints
self.velocityControls = [0] * self.numJoints

self.goalPosition = [0,0,1]
self.goalPosition = [-0.5, -0, 0.2]
self.goalOrientation = [0,-math.pi,-math.pi/2]
self.goalGripper = 0
self.goalReached = False
self.goalEpsilon = 0.1

self.truncated = False
self.graspableObject = None
self.ikEnabled = True # If true, the arm will compute IK to goal position

@@ -35,28 +38,38 @@ def __init__(self,p, spawnPos=[0,0,0], enableDebug=False):
def computeIKInformation(self):
jointInformation = list(map(lambda i: self.p.getJointInfo(self.armId, i), range(self.numJoints)))
self.IKInfo = {}
assert all([len(jointInformation[i]) == 17 for i in range(self.numJoints)] )
self.IKInfo["lowerLimits"] = list(map(lambda i: jointInformation[i][8], range(8)))
self.IKInfo["upperLimits"] = list(map(lambda i: jointInformation[i][9], range(8)))
self.IKInfo["jointRanges"] = list(map(lambda i: jointInformation[i][9] - jointInformation[i][8], range(8)))
self.IKInfo["restPoses"] = [math.pi, math.pi * (1.7/3.0), 3.141592653589, 0.0, 0.0, 0.0, 0.0, 1.0]

self.IKInfo["restPoses"] = (3.1634643356751333, 1.8814690809007582, 2.9530639629442246, 1.3277818360585532, -2.8950796861071364, 2.9204677700686736, 1.3980617813922944, 1.3966983777123398)

self.IKInfo["solver"] = 0
self.IKInfo["jointDamping"] = [0.001] * self.numJoints
self.IKInfo["endEffectorLinkIndex"] = 6


def truncateVal(self, val, lowerBound, upperBound):
return min(max(val, lowerBound), upperBound)
new_val = min(max(val, lowerBound), upperBound)
self.truncated = val != new_val
return new_val

# Use either applyAction or setGoal, not both
def applyAction(self, action):

dx = action[0]
dy = action[1]
dz = action[2]
da = action[3]
grip_state = self.p.getLinkState(self.armId, 6, computeLinkVelocity=1)
grip_pos = np.array(grip_state[0])

self.goalPosition[0] = self.truncateVal(self.goalPosition[0] + dx, -0.5, -0.2)
self.goalPosition[1] = self.truncateVal(self.goalPosition[1] + dy, -0.3, 0.3)
self.goalPosition[2] = self.truncateVal(self.goalPosition[2] + dz, 0.05, 0.6)


self.goalGripper = self.truncateVal(self.goalGripper + da, 0, 1.4)


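The new applyAction treats an action as four small deltas (dx, dy, dz, da) and clamps the updated goal into a fixed workspace box through truncateVal before IK is computed. A minimal sketch of driving that interface directly, assuming the hardcoded micoUrdf path resolves on your machine and that src/ is importable; both are assumptions, not something this diff guarantees:

import pybullet as p
from Mico import Mico  # assumes src/ is on sys.path

p.connect(p.DIRECT)  # headless physics client
arm = Mico(p)        # loads the arm URDF at the origin

# Small Cartesian deltas plus a gripper delta; the goal is clamped into
# x in [-0.5, -0.2], y in [-0.3, 0.3], z in [0.05, 0.6], gripper in [0, 1.4].
arm.applyAction([0.0, 0.01, 0.0, 0.1])

# A delta that would leave the box saturates at the nearest bound.
arm.applyAction([1.0, 0.0, 0.0, 0.0])
print(arm.goalPosition)  # x component is clamped to -0.2

Note that self.truncated is overwritten by each successive truncateVal call, so after applyAction it reflects only the final (gripper) clamp.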
25 changes: 7 additions & 18 deletions src/example.py
@@ -6,7 +6,7 @@
obs = env.reset()
done = False

def policy(observation, desired_goal):
def policy_push(observation, desired_goal):
# Here you would implement your smarter policy. In this case,
# we just sample random actions.
grip_pos = observation[:3]
@@ -16,30 +16,19 @@ def policy(observation, desired_goal):
return (0,0,0,0)
elif grip_pos[2] > 0.13 or grip_pos[0]>object_pos[0]+0.03:
object_pos[1] += 0.1
return np.concatenate((object_pos - grip_pos, np.array([0.1]))) * 0.05
return np.concatenate((object_pos - grip_pos, np.array([0.1])))
else:
return (0,-0.05,0,0.1)
return data

def policy2(observation, desired_goal):
def policy_reach(observation):
# Here you would implement your smarter policy. In this case,
# we just sample random actions.
grip_pos = observation[:3]
print (grip_pos)
object_pos = [-0.2,-0.0,0]

return np.concatenate((object_pos - grip_pos, np.array([0.1]))) * 0.05
goal = observation[3:6]
return np.concatenate((goal - grip_pos, np.array([0.1])))

while not done:
action = policy2(obs['observation'], obs['desired_goal'])
action = policy_reach(obs)
obs, reward, done, info = env.step(action)
env.render()

# If we want, we can substitute a goal here and re-compute
# the reward. For instance, we can just pretend that the desired
# goal was what we achieved all along.
substitute_goal = obs['achieved_goal'].copy()
substitute_reward = env.compute_reward(
obs['achieved_goal'], substitute_goal, info)
print('reward is {}, substitute_reward is {}'.format(
reward, substitute_reward))
env.render()
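The new policy_reach is a plain proportional controller toward the goal slice of the flat observation; as the bullet_robot_env.py changes below show, step() then clips every action component into the new Box(-1, 1) action space (previously Box(-0.01, 0.01)). A self-contained sketch of the same arithmetic with made-up observation values:

import numpy as np

def reach_action(obs, gripper_cmd=0.1):
    # Mirrors policy_reach above: obs[:3] is the gripper position, obs[3:6] the goal.
    grip_pos = np.asarray(obs[:3])
    goal = np.asarray(obs[3:6])
    return np.concatenate((goal - grip_pos, [gripper_cmd]))

obs = np.array([-0.5, 0.0, 0.2, -0.3, 0.1, 0.1])  # illustrative low_dim observation
action = reach_action(obs)
print(action)                      # approximately [ 0.2  0.1 -0.1  0.1]
print(np.clip(action, -1.0, 1.0))  # unchanged here; step() applies this clip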
70 changes: 44 additions & 26 deletions src/micoenv/bullet_robot_env.py
@@ -13,7 +13,7 @@
# Pybullet imports
import pybullet as p
import pybullet_data

import time

import sys
sys.path.insert(1, "../bullet3/build_cmake/examples/pybullet")
@@ -25,7 +25,7 @@ def goal_distance(goal_a, goal_b):

timeStep = 1/240.0
class BulletRobotEnv(gym.GoalEnv):
def __init__(self, n_actions, n_substeps, useDone=True):
def __init__(self, n_actions, n_substeps, observation_type="low_dim", useDone=False, doneAfter=float("inf")):
self.viewer = None
self.n_substeps = n_substeps
self.metadata = {
@@ -37,20 +37,26 @@ def __init__(self, n_actions, n_substeps, useDone=True):
self.p = p
self.useDone = useDone
p.setAdditionalSearchPath(pybullet_data.getDataPath())

self.doneAfter = doneAfter
self.observation_type = observation_type
self.seed()
self._env_setup(initial_qpos=None)

self.goal = self._sample_goal()
obs = self._get_obs()
self.action_space = spaces.Box(-0.01, 0.01, shape=(n_actions,), dtype='float32')


self.observation_space = spaces.Dict(dict(
desired_goal=spaces.Box(-np.inf, np.inf, shape=obs['achieved_goal'].shape, dtype='float32'),
achieved_goal=spaces.Box(-np.inf, np.inf, shape=obs['achieved_goal'].shape, dtype='float32'),
observation=spaces.Box(-np.inf, np.inf, shape=obs['observation'].shape, dtype='float32'),
))
self.action_space = spaces.Box(-1, 1, shape=(n_actions,), dtype='float32')
low_dim_space = spaces.Box(-np.inf, np.inf, shape=(6,), dtype='float32')
pixels_space = spaces.Box(-np.inf, np.inf, shape=(200,200,4), dtype='float32')
if observation_type == "low_dim":
self.observation_space = low_dim_space
elif observation_type == "pixels":
self.observation_space = pixels_space
elif observation_type == "combined":
self.observation_space = spaces.Dict(dict(
low_dim=low_dim_space,
pixels=pixels_space,
observation=spaces.Box(-np.inf, np.inf, shape=obs['observation'].shape, dtype='float32')
))
else:
raise Exception("Unimplemented observation_type")

@property
def dt(self):
@@ -64,22 +70,29 @@ def seed(self, seed=None):
return [seed]

def step(self, action):
if self.numSteps == 0:
self.startTime = time.time()
action = np.clip(action, self.action_space.low, self.action_space.high)
self._set_action(action)
for i in range(self.n_substeps):
p.stepSimulation()
self._step_callback()
obs = self._get_obs()
self.numSteps += 1
done = False
info = {
'is_success': self._is_success(obs['achieved_goal'], self.goal),
}
reward = self.compute_reward(obs['achieved_goal'], self.goal, info)
if self.useDone and self.numSteps >= self._max_episode_steps:
return obs, reward, True, info

return obs, reward, done, info
obs = self._get_obs()
if self.observation_type == "her":
assert "achieved_goal" in obs and "desired_goal" in obs and "observation" in obs
info = {
'is_success': self._is_success_her(obs['achieved_goal'], self.goal),
}
reward = self.compute_reward(obs['achieved_goal'], self.goal)
return obs, reward, False, info
else:
reward = self._get_reward()
done = False
if self.useDone:
done = self._is_success() or self.numSteps > self.doneAfter
return obs, reward, done, {}

def reset(self):
# Attempt to reset the simulator. Since we randomize initial conditions, it
@@ -103,12 +116,15 @@ def close(self):

def render(self, mode='human'):
self._render_callback()
width, height = 500, 500
img = p.getCameraImage(width, height)
data = np.array(img[2], dtype=np.float).reshape(width,height,4) / 255
width, height = 200, 200
viewMatrix=p.computeViewMatrix([-0.5,0.5,0.5],[0,0,0],[0,0,1])
projMatrix=p.computeProjectionMatrixFOV(fov=90,aspect=4./3.,nearVal=0.01,farVal=2)
img = p.getCameraImage(width,height,viewMatrix,projMatrix)
# img = p.getCameraImage(width,height)
if mode == 'rgb_array':
return data
return np.array(img[2], dtype=np.float).reshape(width,height,4) / 255
elif mode == 'human':
data = np.array(img[2], dtype=np.float).reshape(width,height,4) / 255
cv2.imshow("test", data)
cv2.waitKey(1)
self._render_cleanup()
@@ -131,6 +147,8 @@ def compute_reward(self, achieved_goal, goal, info):



def timePerStep(self):
return (time.time() - self.startTime) / self.numSteps
# Extension methods
# ----------------------------

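The new pixels observation space advertises a 200x200 RGBA image, and render() now builds it from a fixed camera via computeViewMatrix and computeProjectionMatrixFOV. A standalone sketch of that camera pipeline, runnable outside the environment; the plane.urdf scene is only a placeholder so the image is not empty:

import numpy as np
import pybullet as p
import pybullet_data

p.connect(p.DIRECT)  # headless client; getCameraImage falls back to the software renderer
p.setAdditionalSearchPath(pybullet_data.getDataPath())
p.loadURDF("plane.urdf")

width, height = 200, 200
view = p.computeViewMatrix([-0.5, 0.5, 0.5], [0, 0, 0], [0, 0, 1])
proj = p.computeProjectionMatrixFOV(fov=90, aspect=4.0 / 3.0, nearVal=0.01, farVal=2)
img = p.getCameraImage(width, height, view, proj)

rgba = np.array(img[2], dtype=np.float64).reshape(height, width, 4) / 255.0
print(rgba.shape)  # (200, 200, 4), matching the pixels observation space above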
(The remaining two changed files are not shown here.)
