from lm_eval.base import BaseLM
from lm_eval import evaluator, tasks
import torch, json, argparse
from examples.llama import LLaMa
from tinygrad.tensor import Tensor
from tinygrad import Device
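
# LLaMaAdaptor exposes the tinygrad LLaMa model through lm-eval-harness's BaseLM
# interface, so evaluator.evaluate() below can score it on standard benchmark tasks.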
class LLaMaAdaptor(BaseLM):
  def __init__(
    self,
    model_size="7B",
    model_gen=1,
    device="",
    quantize=False,
    batch_size=1,
    max_batch_size=1,
    do_sample=False,
    temperature=1.0,
    checkpoint_path="",
    tokenizer_path="",
  ):
    super().__init__()

    if batch_size is None:
      batch_size = 1
    self.do_sample = do_sample
    self.temperature = temperature
    self._device = device

    # sanity-check the arguments before the (slow) weight load below
    assert isinstance(model_gen, int)
    assert isinstance(model_size, str)
    assert isinstance(batch_size, int)
    assert isinstance(checkpoint_path, str)
    assert isinstance(tokenizer_path, str)

    # build the tinygrad LLaMa model and tokenizer from the given paths
    self.llama = LLaMa.build(checkpoint_path, tokenizer_path, model_gen, model_size, quantize)
  @classmethod
  def create_from_arg_string(cls, arg_string, additional_config=None):
    # lm-eval-harness passes model arguments as a "key=value,key=value" string
    kwargs = {el.split("=")[0]: el.split("=")[1] for el in arg_string.split(",")}
    return cls(**kwargs, **additional_config)

  @property
  def eot_token_id(self):
    # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
    return self.llama.tokenizer.eos_id()

  @property
  def max_length(self):
    return 1024

  @property
  def max_gen_toks(self):
    return 256

  @property
  def batch_size(self):
    return 1

  @property
  def device(self):
    return self._device

  def tok_encode(self, string: str):
    return [self.llama.tokenizer.bos_id()] + self.llama.tokenizer.encode(string)

  def tok_decode(self, tokens):
    return self.llama.tokenizer.decode(tokens)

  def _model_call(self, inps):
    # the harness hands us a torch tensor of token ids; run it through the tinygrad
    # model and return the logits as a torch tensor
    Tensor.no_grad = True
    return torch.Tensor(self.llama.model(Tensor(inps.numpy()), 0).numpy())

  def greedy_until(self, requests):
    # generate greedily for each (prompt, until) request and return only the continuation
    continuations = []
    for request in requests:
      prompt, until = request[0], request[1]['until']
      output = self.llama.greedy_until(prompt, until, max_length=128, temperature=0.0)
      continuations.append(output[len(prompt):])
    return continuations

  def _model_generate(self, context, max_length, eos_token_id):
    raise NotImplementedError()

if __name__ == '__main__':
  print(f"using {Device.DEFAULT} backend")

  parser = argparse.ArgumentParser(description='Run LLaMA evals in tinygrad', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--size', type=str, default="7B", help="Size of model to use [7B, 13B, 30B, 65B] for Gen 1, [7B, 13B] for Gen 2")
  parser.add_argument('--gen', type=int, default=1, help="Generation of the model to use [1, 2]")
  parser.add_argument('--quantize', action='store_true', help="Quantize the weights to int8 in memory")
  parser.add_argument('--eval', type=str, default="arc_easy", help="Comma-separated list of lm-eval tasks to run")
  parser.add_argument('--limit', type=int, default=None, help="Limit the number of examples per task")
  parser.add_argument('--weights', type=str, default="./weights/LLaMa/", help="Location of the weights")
  parser.add_argument('--tokenizer', type=str, default="./weights/LLaMa/tokenizer.model", help="Location of the tokenizer")
  args = parser.parse_args()

  # run the eval and print the results as JSON
  adaptor = LLaMaAdaptor(model_gen=args.gen, model_size=args.size, quantize=args.quantize,
                         checkpoint_path=args.weights, tokenizer_path=args.tokenizer, device="cpu")
  results = evaluator.evaluate(adaptor, tasks.get_task_dict(args.eval.split(",")), False, 0, args.limit)
  print(json.dumps(results, indent=2))
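
# Example invocation (a sketch; assumes this script is saved as examples/llama_eval.py in a
# tinygrad checkout and that gen-1 7B weights plus tokenizer.model sit under ./weights/LLaMa/):
#   python3 examples/llama_eval.py --size 7B --gen 1 --eval arc_easy,hellaswag --limit 100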