arguments.py
from dataclasses import dataclass, field
from typing import List, Optional

import torch
from transformers import TrainingArguments
@dataclass
class CollaborativeArguments:
    """Configuration for CollaborativeOptimizer and its internals"""

    target_batch_size: int = field(
        default=16384,
        metadata={"help": "Perform optimizer step after all peers collectively accumulate this many samples"},
    )
    matchmaking_time: float = field(
        default=60.0, metadata={"help": "Averaging group will wait for stragglers for at most this many seconds"}
    )
    next_chunk_timeout: float = field(
        default=60.0, metadata={"help": "Consider allreduce peer failed if it does not respond in this many seconds"}
    )
    averaging_timeout: float = field(
        default=600.0, metadata={"help": "Give up on averaging step after this many seconds"}
    )
    offload_optimizer: bool = field(default=True, metadata={"help": "Whether or not to offload optimizer into RAM"})
    delay_optimizer_step: bool = field(
        default=True, metadata={"help": "Whether or not to run optimizer step in background"}
    )
    delay_grad_averaging: bool = field(
        default=True, metadata={"help": "Whether or not to run gradient averaging in background"}
    )
    average_state_every: int = field(default=5, metadata={"help": "Average parameters every this many epochs"})
    reuse_grad_buffers: bool = field(
        default=True,
        metadata={
            "help": "Whether or not to use model's .grad buffers for accumulating gradients across local steps. This "
            "optimization reduces GPU memory consumption but may result in incorrect gradients when using some "
            "advanced techniques (e.g. changing loss scaler to a custom one)."
        },
    )
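# Illustrative sketch (not part of the original file): because CollaborativeArguments is a
# plain dataclass, it can be parsed from the command line with transformers.HfArgumentParser
# and unpacked into keyword arguments for the collaborative optimizer. The optimizer call
# itself is only hinted at in a comment, since its exact signature is an assumption here.
def _example_parse_collaborative_arguments(argv=None):
    from dataclasses import asdict

    from transformers import HfArgumentParser

    (collab_args,) = HfArgumentParser(CollaborativeArguments).parse_args_into_dataclasses(args=argv)
    # e.g. hivemind.Optimizer(dht=dht, run_id=..., batch_size_per_step=..., **asdict(collab_args))
    return asdict(collab_args)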
@dataclass
class HFTrainerArguments(TrainingArguments):
    """Arguments for huggingface/transformers.Trainer"""

    per_device_train_batch_size: int = 1
    per_device_eval_batch_size: int = 1
    gradient_accumulation_steps: int = 1

    learning_rate: float = 0.0025
    total_steps: int = 31250  # total number of collaborative optimizer updates, used for the learning rate schedule
    warmup_steps: int = 3125
    min_learning_rate: float = 1e-5  # learning rate after total_steps have passed
    adam_beta1: float = 0.9
    adam_beta2: float = 0.95
    adam_epsilon: float = 1e-6
    weight_decay: float = 0.01
    max_grad_norm: float = 1.0  # clipping is performed by the optimizer; the trainer is modified to disable its built-in clipping
    clamp_value: float = 1e9  # no clipping by value

    gradient_checkpointing: bool = False  # can be enabled to save memory at the cost of ~30% slower training
    fp16: bool = False  # can be enabled depending on the device

    max_sequence_length: int = 512
    sequence_length_warmup_steps: int = 10_000
    initial_sequence_length: Optional[int] = 128  # used only if warmup > 0, default = pad_to_multiple_of
    pad_to_multiple_of: int = 32  # sequence length will be divisible by this value

    output_dir: str = "outputs"
    logging_steps: int = 100

    # params that should *not* be changed
    do_train: bool = True
    do_eval: bool = False
    logging_first_step: bool = True
    dataloader_num_workers: int = 0  # temporary fix for https://github.com/huggingface/datasets/issues/3148
    max_steps: int = 10 ** 30
    save_steps: int = 10 ** 30
    save_total_limit: int = 2

    @property
    def batch_size_per_step(self):
        """Compute the number of training sequences contributed by each .step() from this peer"""
        total_batch_size_per_step = self.per_device_train_batch_size * self.gradient_accumulation_steps
        if torch.cuda.device_count() > 0:
            total_batch_size_per_step *= torch.cuda.device_count()
        return total_batch_size_per_step
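# Worked example for batch_size_per_step (illustrative values, not the defaults of any real run):
# with per_device_train_batch_size=4, gradient_accumulation_steps=8 and 2 visible CUDA devices,
# each .step() contributes 4 * 8 * 2 = 64 sequences towards target_batch_size; without CUDA
# devices the multiplier is dropped and the same settings contribute 4 * 8 = 32 sequences.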
@dataclass
class TPUTrainerArguments(HFTrainerArguments):
    num_tpus: int = 8  # the total number of TPU cores in use
    wandb_project: str = "huggingface"

    @property
    def batch_size_per_step(self):
        """Compute the number of training sequences contributed by each .step() from this peer"""
        return self.per_device_train_batch_size * self.gradient_accumulation_steps * self.num_tpus
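# The TPU variant multiplies by the configured core count instead of torch.cuda.device_count():
# e.g. with per_device_train_batch_size=1, gradient_accumulation_steps=4 and num_tpus=8,
# each .step() contributes 1 * 4 * 8 = 32 sequences (illustrative values).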
@dataclass
class BasePeerArguments:
    """Base arguments that are used both for trainers and for auxiliary peers such as the training monitor"""

    run_id: str = field(metadata={"help": "A unique experiment name, used as prefix for all DHT keys"})
    model_config_path: Optional[str] = field(
        default="./tasks/mlm/model.json", metadata={"help": "Path to the model config"}
    )
    tokenizer_path: Optional[str] = field(
        default="aubmindlab/bert-base-arabert", metadata={"help": "Path to the tokenizer"}
    )
    cache_dir: Optional[str] = field(default="./cache", metadata={"help": "Path to the cache"})
    authorize: bool = field(default=True, metadata={"help": "Whether or not to use HF authorizer"})
    client_mode: bool = field(
        default=False,
        metadata={"help": "If True, runs training without incoming connections, in a firewall-compatible mode"},
    )
    bandwidth: Optional[float] = field(
        default=None,
        metadata={"help": "Min(upload & download speed) in megabits/s, used to assign averaging tasks between peers"},
    )
    min_vector_size: int = 4_000_000  # minimum slice of gradients assigned to one reducer, should be the same across peers
    initial_peers: List[str] = field(
        default_factory=list,
        metadata={
            "help": "Multiaddrs of the peers that will welcome you into the existing collaboration. "
            "Example: /ip4/203.0.113.1/tcp/31337/p2p/XXXX /ip4/203.0.113.2/udp/7777/quic/p2p/YYYY"
        },
    )
    use_ipfs: bool = field(
        default=False,
        metadata={
            "help": "Use IPFS to find initial_peers. If enabled, you only need to provide the /p2p/XXXX part of the multiaddrs "
            "for the initial_peers (no need to specify a particular IPv4/IPv6 address and port)"
        },
    )
    host_maddrs: List[str] = field(
        default_factory=lambda: ["/ip4/0.0.0.0/tcp/0"],
        metadata={
            "help": "Multiaddrs to listen for external connections from other p2p instances. "
            "Defaults to all IPv4 interfaces with the TCP protocol: /ip4/0.0.0.0/tcp/0"
        },
    )
    announce_maddrs: List[str] = field(
        default_factory=list,
        metadata={"help": "Visible multiaddrs the host announces for external connections from other p2p instances"},
    )
    identity_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "Path to a pre-generated private key file. If defined, makes the peer ID deterministic. "
            "May be generated using ``./p2p-keygen`` from ``go-libp2p-daemon``."
        },
    )
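# Minimal sketch (not part of the original file): the networking fields above are the ones a
# peer would typically forward to hivemind's DHT, mirroring the upstream hivemind examples.
# The exact keyword names of hivemind.DHT are an assumption of this sketch, not something
# this file guarantees.
def _example_make_dht(peer_args: BasePeerArguments):
    import hivemind  # assumed dependency of the surrounding project

    return hivemind.DHT(
        start=True,
        initial_peers=peer_args.initial_peers,
        client_mode=peer_args.client_mode,
        host_maddrs=peer_args.host_maddrs,
        announce_maddrs=peer_args.announce_maddrs,
        use_ipfs=peer_args.use_ipfs,
        identity_path=peer_args.identity_path,
    )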
@dataclass
class TrainingPeerArguments(BasePeerArguments):
    statistics_expiration: float = field(
        default=600, metadata={"help": "Statistics will be removed if not updated in this many seconds"}
    )
    backup_every_epochs: Optional[int] = field(
        default=None,
        metadata={
            "help": "Update training state backup on disk once in this many global steps "
            "(default = do not update local state)"
        },
    )
    state_path: str = field(
        default="state.zip", metadata={"help": "Load this state upon init and when recovering from NaN parameters"}
    )
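# Hypothetical command line (the script name is an assumption; the multiaddr is the example
# from the initial_peers help string above):
#   python run_trainer.py --run_id my-exp \
#       --initial_peers /ip4/203.0.113.1/tcp/31337/p2p/XXXX \
#       --backup_every_epochs 10 --state_path state.zip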
@dataclass
class AuxiliaryPeerArguments(BasePeerArguments):
    """
    Arguments for run_aux_peer.py, which is responsible for connecting peers to one another, tracking
    learning curves, assisting in all-reduce and uploading checkpoints to the hub
    """

    refresh_period: float = field(default=10, metadata={"help": "Period (in seconds) for fetching the keys from DHT"})
    wandb_project: Optional[str] = field(
        default=None, metadata={"help": "Name of Weights & Biases project to report the training progress to"}
    )
    save_checkpoint_epoch_interval: int = field(
        default=5, metadata={"help": "Frequency (in steps) of fetching and saving state from peers"}
    )
    repo_url: Optional[str] = field(
        default=None, metadata={"help": "URL of Hugging Face Hub repository to upload the model and optimizer states"}
    )
    local_path: Optional[str] = field(
        default="Repo", metadata={"help": "Path to local repository to store the model and optimizer states"}
    )
    upload_interval: Optional[float] = field(
        default=None, metadata={"help": "Frequency (in seconds) of uploading the model to Hub"}
    )
    store_checkpoints: bool = field(default=True, metadata={"help": "If True, enables CheckpointHandler"})
    assist_in_averaging: bool = field(
        default=False, metadata={"help": "If True, this peer will facilitate averaging for other (training) peers"}
    )
    assist_refresh: float = field(default=1.0, metadata={"help": "Period (in seconds) for trying to assist averaging"})
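# Minimal sketch (not part of the original file) of how a run_aux_peer.py-style entry point
# might combine these dataclasses; the exact grouping is an assumption based on the class
# docstring above, not a documented interface.
def _example_parse_aux_peer_arguments(argv=None):
    from transformers import HfArgumentParser

    parser = HfArgumentParser((AuxiliaryPeerArguments, HFTrainerArguments, CollaborativeArguments))
    peer_args, trainer_args, collab_args = parser.parse_args_into_dataclasses(args=argv)
    return peer_args, trainer_args, collab_args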