-
Notifications
You must be signed in to change notification settings - Fork 184
/
format.py
82 lines (59 loc) · 2.52 KB
/
format.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os
import json
import numpy
import re
import torch
import torch_ac
import gymnasium as gym
import utils
def get_obss_preprocessor(obs_space):
    """Build an observation preprocessor matched to `obs_space`.

    Returns a pair `(obs_space, preprocess_obss)` where `obs_space` is a
    plain dict describing the processed observation shapes and
    `preprocess_obss(obss, device=None)` turns a list of raw observations
    into a `torch_ac.DictList` of tensors.

    Raises:
        ValueError: if `obs_space` is neither a Box nor a Dict space
            containing an "image" entry.
    """
    if isinstance(obs_space, gym.spaces.Box):
        # Raw image space: each observation is the image itself.
        obs_space = {"image": obs_space.shape}

        def preprocess_obss(obss, device=None):
            images = preprocess_images(obss, device=device)
            return torch_ac.DictList({"image": images})

    elif isinstance(obs_space, gym.spaces.Dict) and "image" in obs_space.spaces.keys():
        # MiniGrid-style space: dict observations with an "image" array and
        # a "mission" string, indexed through a 100-word vocabulary.
        image_shape = obs_space.spaces["image"].shape
        obs_space = {"image": image_shape, "text": 100}

        vocab = Vocabulary(obs_space["text"])

        def preprocess_obss(obss, device=None):
            images = [obs["image"] for obs in obss]
            missions = [obs["mission"] for obs in obss]
            return torch_ac.DictList({
                "image": preprocess_images(images, device=device),
                "text": preprocess_texts(missions, vocab, device=device)
            })
        # Expose the vocabulary so callers can save/load it alongside the model.
        preprocess_obss.vocab = vocab

    else:
        raise ValueError("Unknown observation space: " + str(obs_space))

    return obs_space, preprocess_obss
def preprocess_images(images, device=None):
    """Pack a batch of images into a single float tensor on `device`.

    Converting a nested Python list straight to a tensor is very slow in
    PyTorch, so the batch is first collected into one numpy array.
    """
    batch = numpy.array(images)
    return torch.tensor(batch, device=device, dtype=torch.float)
def preprocess_texts(texts, vocab, device=None):
    """Tokenize, index, and zero-pad `texts` into a long tensor.

    Each text is lowercased, split into alphabetic tokens, and mapped
    through `vocab` (token -> positive integer id). Rows shorter than the
    longest text are right-padded with zeros.
    """
    token_rows = []
    longest = 0
    for text in texts:
        words = re.findall("([a-z]+)", text.lower())
        row = numpy.array([vocab[word] for word in words])
        token_rows.append(row)
        longest = max(longest, len(row))

    padded = numpy.zeros((len(texts), longest))
    for row_idx, row in enumerate(token_rows):
        padded[row_idx, :len(row)] = row

    return torch.tensor(padded, device=device, dtype=torch.long)
class Vocabulary:
    """A mapping from tokens to ids with a capacity of `max_size` words.
    It can be saved in a `vocab.json` file."""

    def __init__(self, max_size):
        # Maximum number of distinct tokens this vocabulary may hold.
        self.max_size = max_size
        # token -> id mapping; ids start at 1 so that 0 stays free as the
        # padding value used by preprocess_texts.
        self.vocab = {}

    def load_vocab(self, vocab):
        """Replace the current mapping with a previously saved `vocab` dict."""
        self.vocab = vocab

    def __getitem__(self, token):
        """Return the id for `token`, assigning the next free id if unseen.

        Raises:
            ValueError: if assigning a new id would exceed `max_size`.
        """
        # Idiomatic membership test (was `not token in self.vocab.keys()`).
        if token not in self.vocab:
            if len(self.vocab) >= self.max_size:
                raise ValueError("Maximum vocabulary capacity reached")
            self.vocab[token] = len(self.vocab) + 1
        return self.vocab[token]