You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
83 lines
2.0 KiB
83 lines
2.0 KiB
from extra.hcqfuzz.spec import TestSpec
|
|
import random
|
|
|
|
bert_train_params = {
|
|
"DEFAULT_FLOAT": "HALF",
|
|
"SUM_DTYPE": "HALF",
|
|
"GPUS": 6,
|
|
"BS": 96,
|
|
"EVAL_BS": 96,
|
|
"FUSE_ARANGE": 1,
|
|
"FUSE_ARANGE_UINT": 0,
|
|
"BASEDIR": "/raid/datasets/wiki",
|
|
}
|
|
|
|
class TrainBert(TestSpec):
|
|
def prepare(self, dev, seed):
|
|
random.seed(seed)
|
|
|
|
self.env = {
|
|
**bert_train_params,
|
|
"IGNORE_BEAM_CACHE": 1,
|
|
"BEAM": 5,
|
|
"BEAM_UOPS_MAX": 10000,
|
|
"BEAM_UPCAST_MAX": 256,
|
|
"BEAM_LOCAL_MAX": 1024,
|
|
"BEAM_MIN_PROGRESS": 5,
|
|
"IGNORE_JIT_FIRST_BEAM": 1,
|
|
"LOGMLPERF": 0,
|
|
"SEED": seed,
|
|
}
|
|
|
|
self.cmd = "python3 examples/mlperf/model_train.py"
|
|
self.timeout = 7 * 60 * 60 # 7 hours
|
|
|
|
def get_exec_state(self): return self.env, self.cmd, self.timeout
|
|
|
|
class TrainBertShort(TestSpec):
|
|
def prepare(self, dev, seed):
|
|
random.seed(seed)
|
|
|
|
self.env = {
|
|
**bert_train_params,
|
|
"IGNORE_BEAM_CACHE": 1,
|
|
"BEAM": 5,
|
|
"BEAM_UOPS_MAX": 10000,
|
|
"BEAM_UPCAST_MAX": 256,
|
|
"BEAM_LOCAL_MAX": 1024,
|
|
"BEAM_MIN_PROGRESS": 5,
|
|
"IGNORE_JIT_FIRST_BEAM": 1,
|
|
"SEED": seed,
|
|
"BENCHMARK": 4096,
|
|
"JIT": 2
|
|
}
|
|
|
|
self.cmd = "python3 examples/mlperf/model_train.py"
|
|
self.timeout = 2 * 60 * 60 # 2 hours
|
|
|
|
def get_exec_state(self): return self.env, self.cmd, self.timeout
|
|
|
|
class BertBeam(TestSpec):
|
|
def prepare(self, dev, seed):
|
|
random.seed(seed)
|
|
|
|
self.env = {
|
|
**bert_train_params,
|
|
"IGNORE_BEAM_CACHE": 1,
|
|
"BEAM": random.choice([1, 2, 3, 4, 5]),
|
|
"BEAM_UOPS_MAX": 10000,
|
|
"BEAM_UPCAST_MAX": 256,
|
|
"BEAM_LOCAL_MAX": 1024,
|
|
"BEAM_MIN_PROGRESS": 5,
|
|
"IGNORE_JIT_FIRST_BEAM": 1,
|
|
"SEED": seed,
|
|
"RESET_STEP": 1,
|
|
"BENCHMARK": 10,
|
|
"BERT_LAYERS": 2,
|
|
"SEED": seed,
|
|
}
|
|
|
|
self.cmd = "python3 examples/mlperf/model_train.py"
|
|
self.timeout = 1 * 60 * 60 # 1 hour
|
|
|
|
def get_exec_state(self): return self.env, self.cmd, self.timeout
|
|
|