# Language Model

Ecco's central class. A wrapper around language models. We use it to run the language models and collect important data like input saliency and neuron activations.

An `LM` object is typically not created directly by users; it is returned by `ecco.from_pretrained()`.
Usage:

```python
import ecco

lm = ecco.from_pretrained('distilgpt2')
output = lm.generate("Hello computer")
```
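The data collected during generation can then be explored through the returned object. A minimal sketch, assuming a Jupyter notebook environment; the `saliency()` visualization method follows Ecco's published examples and is an assumption here:

```python
import ecco

# Load the model; generate() calculates attribution by default.
lm = ecco.from_pretrained('distilgpt2')
output = lm.generate("Hello computer", generate=5)

# Visualize input saliency for the generated tokens (notebook only).
output.saliency()
```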
## __call__(self, input_tokens)

*special*

Runs a forward pass through the model, for when we don't care about output tokens. Currently only supports activation collection; no attribution/saliency.
Usage:

```python
inputs = lm.tokenizer("Hello computer", return_tensors="pt")
output = lm(inputs)
```
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `input_tokens` | `Tensor` | Dict returned by `tokenizer(text, return_tensors="pt")`. Contains the key `'input_ids'`, whose value is a tensor of input token ids with shape `(batch_size, sequence_length)`, as well as an `'attention_mask'` key. | *required* |
| `attribution` | | Flag indicating whether to calculate attribution/saliency. Currently unused: `__call__` does not support attribution. | *required* |
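For instance, a forward pass can collect neuron activations without generating any tokens. A minimal sketch, assuming the `LM` was created with activation collection enabled (e.g. `ecco.from_pretrained('distilgpt2', activations=True)`); the `'decoder'` key and the array shape depend on the model type:

```python
inputs = lm.tokenizer("Hello computer", return_tensors="pt")
output = lm(inputs)

# Activations are grouped by layer type ('decoder' for a causal model is an
# assumption here); each entry aggregates the hooked layers' activations.
print(output.activations['decoder'].shape)
```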
Source code in ecco/lm.py
````python
def __call__(self,
             # input_str: Optional[str] = '',
             input_tokens: torch.Tensor,
             # attribution: Optional[bool] = True,
             ):
    """
    Run a forward pass through the model. For when we don't care about output tokens.
    Currently only supports activation collection. No attribution/saliency.

    Usage:

    ```python
    inputs = lm.tokenizer("Hello computer", return_tensors="pt")
    output = lm(inputs)
    ```

    Args:
        input_tokens: Dict returned by tokenizer(text, return_tensors="pt").
            Contains the key 'input_ids', whose value is a tensor of input token ids.
            Shape is (batch_size, sequence_length).
            Also contains an 'attention_mask' key.
        attribution: Flag indicating whether to calculate attribution/saliency
    """
    if 'input_ids' not in input_tokens:
        raise ValueError("Parameter 'input_tokens' needs to have the attribute 'input_ids'. "
                         "Verify it was produced by the appropriate tokenizer with the "
                         "parameter return_tensors=\"pt\".")

    # Move inputs to GPU if the model is on GPU
    if self.model.device.type == "cuda" and input_tokens['input_ids'].device.type == "cpu":
        input_tokens = self.to(input_tokens)

    # Remove downstream. For now setting to batch length
    n_input_tokens = len(input_tokens['input_ids'][0])

    # model
    if self.model_type == 'mlm':
        output = self.model(**input_tokens, return_dict=True)
        lm_head = None
    elif self.model_type == 'causal':
        output = self.model(**input_tokens, return_dict=True, use_cache=False)
        lm_head = self.model.lm_head
    elif self.model_type == 'enc-dec':
        decoder_input_ids = self.model._prepare_decoder_input_ids_for_generation(input_tokens['input_ids'], None, None)
        output = self.model(**input_tokens, decoder_input_ids=decoder_input_ids, return_dict=True, use_cache=False)
        lm_head = self.model.lm_head
    else:
        raise NotImplementedError(f"model type {self.model_type} not found")

    # Turn activations from dict to a proper array
    activations_dict = self._all_activations_dict
    for layer_type, activations in activations_dict.items():
        self.activations[layer_type] = activations_dict_to_array(activations)

    encoder_hidden_states = getattr(output, "encoder_hidden_states", None)
    decoder_hidden_states = getattr(output, "hidden_states", getattr(output, "decoder_hidden_states", None))

    if self.model_type in ['causal', 'mlm']:
        # The first hidden state of a causal/MLM model is the embedding layer, skip it
        # FIXME: do this in a cleaner way
        embedding_states = decoder_hidden_states[0]
        decoder_hidden_states = decoder_hidden_states[1:]
    elif self.model_type == 'enc-dec':
        embedding_states = encoder_hidden_states[0]
        encoder_hidden_states = encoder_hidden_states[1:]
    else:
        raise NotImplementedError(f"model type {self.model_type} not found")

    tokens = []
    for i in input_tokens['input_ids']:
        token = self.tokenizer.convert_ids_to_tokens(i)
        tokens.append(token)

    attn = getattr(output, "attentions", None)

    return OutputSeq(**{'tokenizer': self.tokenizer,
                        'token_ids': input_tokens['input_ids'],
                        'n_input_tokens': n_input_tokens,
                        # 'output_text': self.tokenizer.decode(input_ids),
                        'tokens': tokens,
                        'embedding_states': embedding_states,
                        'encoder_hidden_states': encoder_hidden_states,
                        'decoder_hidden_states': decoder_hidden_states,
                        'attention': attn,
                        # 'model_outputs': outputs,
                        # 'attribution': attributions,
                        'activations': self.activations,
                        'collect_activations_layer_nums': self.collect_activations_layer_nums,
                        'lm_head': lm_head,
                        'model_type': self.model_type,
                        'device': self.device})
````
## __init__(self, model, tokenizer, model_name, config, collect_activations_flag=False, collect_activations_layer_nums=None, verbose=True, gpu=True)

*special*
Creates an LM object given a model and tokenizer.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `model` | `PreTrainedModel` | HuggingFace Transformers PyTorch language model. | *required* |
| `tokenizer` | `PreTrainedTokenizerFast` | The tokenizer associated with the model. | *required* |
| `model_name` | `str` | The name of the model. Used to retrieve required settings (like what the embedding layer is called). | *required* |
| `config` | `Dict[str, Any]` | Configuration that has the information about the layers whose activations we will collect. | *required* |
| `collect_activations_flag` | `Optional[bool]` | True if we want to collect activations. | `False` |
| `collect_activations_layer_nums` | `Optional[List[int]]` | If collecting activations, indicates which layers to track. By default this is None, and activations are collected for all layers. | `None` |
| `verbose` | `Optional[bool]` | If True, `model.generate()` displays output tokens in HTML as they're generated. | `True` |
| `gpu` | `Optional[bool]` | Set to False to force using the CPU even if a GPU exists. | `True` |
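For reference, a minimal sketch of constructing an `LM` directly; users would normally rely on `ecco.from_pretrained()` instead. The `config` keys mirror what `__init__` reads (`'type'`, `'embedding'`, `'activations'`); the GPT-2 values below are illustrative and may differ from Ecco's bundled `model-config.yaml`:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from ecco.lm import LM

# Request hidden states and attentions so they appear in model outputs.
model = AutoModelForCausalLM.from_pretrained('distilgpt2',
                                             output_hidden_states=True,
                                             output_attentions=True)
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')

# Illustrative config; the keys match what __init__ reads below.
config = {
    'type': 'causal',                 # 'causal', 'mlm', or 'enc-dec'
    'embedding': 'transformer.wte',   # attribute path to the embedding layer
    'activations': [r'mlp\.c_proj'],  # signature of the layers to hook
}

lm = LM(model, tokenizer,
        model_name='distilgpt2',
        config=config,
        collect_activations_flag=True)
```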
Source code in ecco/lm.py
```python
def __init__(self,
             model: transformers.PreTrainedModel,
             tokenizer: transformers.PreTrainedTokenizerFast,
             model_name: str,
             config: Dict[str, Any],
             collect_activations_flag: Optional[bool] = False,
             collect_activations_layer_nums: Optional[List[int]] = None,  # None --> collect for all layers
             verbose: Optional[bool] = True,
             gpu: Optional[bool] = True
             ):
    """
    Creates an LM object given a model and tokenizer.

    Args:
        model: HuggingFace Transformers PyTorch language model.
        tokenizer: The tokenizer associated with the model
        model_name: The name of the model. Used to retrieve required settings (like what the embedding layer is called)
        config: Configuration that has the information about the layers whose activations we will collect
        collect_activations_flag: True if we want to collect activations
        collect_activations_layer_nums: If collecting activations, we can use this parameter to indicate which layers
            to track. By default this is None, and we collect activations for all layers.
        verbose: If True, model.generate() displays output tokens in HTML as they're generated.
        gpu: Set to False to force using the CPU even if a GPU exists.
    """
    self.model_name = model_name
    self.model = model
    if torch.cuda.is_available() and gpu:
        self.model = model.to('cuda')

    self.device = ('cuda' if torch.cuda.is_available()
                   and self.model.device.type == 'cuda'
                   else 'cpu')

    self.tokenizer = tokenizer
    self.verbose = verbose
    self._path = os.path.dirname(ecco.__file__)

    # Neuron Activation
    self.collect_activations_flag = collect_activations_flag
    self.collect_activations_layer_nums = collect_activations_layer_nums

    # For each model, this indicates the layer whose activations
    # we will collect
    self.model_config = config
    try:
        self.model_type = self.model_config['type']
        embeddings_layer_name = self.model_config['embedding']
        embed_retriever = attrgetter(embeddings_layer_name)
        self.model_embeddings = embed_retriever(self.model)
        self.collect_activations_layer_name_sig = self.model_config['activations'][0]
    except KeyError:
        raise ValueError(
            f"The model '{self.model_name}' is not correctly configured in Ecco's 'model-config.yaml' file"
        ) from KeyError()

    self._hooks = {}
    self._reset()
    self._attach_hooks(self.model)
```
## generate(self, input_str, max_length=8, temperature=None, top_k=None, top_p=None, get_model_output=False, do_sample=None, attribution=True, generate=None)

Generates tokens in response to an input prompt. Works with autoregressive language models like GPT-2, not masked language models like BERT.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `input_str` | `str` | Input prompt. | *required* |
| `generate` | `Optional[int]` | Number of tokens to generate. | `None` |
| `max_length` | `Optional[int]` | Maximum length of the sequence (input + output tokens). | `8` |
| `temperature` | `Optional[float]` | Adjusts the probability distribution of output candidate tokens. | `None` |
| `top_k` | `Optional[int]` | Specifies the top-k tokens to consider in decoding. Only used when `do_sample` is True. | `None` |
| `top_p` | `Optional[float]` | Specifies top-p to consider in decoding. Only used when `do_sample` is True. | `None` |
| `get_model_output` | `Optional[bool]` | Flag to retrieve the final output object returned by the underlying language model. | `False` |
| `do_sample` | `Optional[bool]` | Decoding parameter. If set to False, the model always chooses the highest-scoring candidate output token, which may lead to repetitive text. If set to True, the model consults `top_k` and/or `top_p` to generate more interesting output. | `None` |
| `attribution` | `Optional[bool]` | If True, the object will calculate input saliency/attribution. | `True` |
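A short sketch of the two decoding modes described above (the prompt and parameter values are illustrative):

```python
# Greedy decoding: always pick the highest-scoring token. May repeat itself.
output = lm.generate("The capital of France is", generate=5, do_sample=False)

# Sampled decoding: consult top_k / top_p / temperature for more varied output.
output = lm.generate("The capital of France is", generate=5,
                     do_sample=True, top_k=50, top_p=0.95, temperature=0.8)
```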
Source code in ecco/lm.py
```python
def generate(self, input_str: str,
             max_length: Optional[int] = 8,
             temperature: Optional[float] = None,
             top_k: Optional[int] = None,
             top_p: Optional[float] = None,
             get_model_output: Optional[bool] = False,
             do_sample: Optional[bool] = None,
             attribution: Optional[bool] = True,
             generate: Optional[int] = None):
    """
    Generate tokens in response to an input prompt.
    Works with language models like GPT2, not masked language models like BERT.

    Args:
        input_str: Input prompt. # TODO: accept batch of input strings
        generate: Number of tokens to generate.
        max_length: max length of sequence (input + output tokens)
        temperature: Adjust the probability distribution of output candidate tokens.
        top_k: Specify top-k tokens to consider in decoding. Only used when do_sample is True.
        top_p: Specify top-p to consider in decoding. Only used when do_sample is True.
        get_model_output: Flag to retrieve the final output object returned by the underlying language model.
        do_sample: Decoding parameter. If set to False, the model always
            chooses the highest scoring candidate output token. This may lead
            to repetitive text. If set to True, the model consults top_k
            and/or top_p to generate more interesting output.
        attribution: If True, the object will calculate input saliency/attribution.
    """
    assert self.model_type != 'mlm', "generate method not supported for MLMs"

    top_k = top_k if top_k is not None else self.model.config.top_k
    top_p = top_p if top_p is not None else self.model.config.top_p
    temperature = temperature if temperature is not None else self.model.config.temperature
    do_sample = do_sample if do_sample is not None else self.model.config.task_specific_params.get('text-generation', {}).get('do_sample', False)

    pad_token_id = self.model.config.pad_token_id
    eos_token_id = self.model.config.eos_token_id

    # We need this as a batch in order to collect activations.
    input_ids = self.tokenizer(input_str, return_tensors="pt")['input_ids']
    n_input_tokens = len(input_ids[0])
    cur_len = n_input_tokens

    if generate is not None:
        max_length = n_input_tokens + generate

    past = None
    self.attributions = {}
    outputs = []

    if cur_len >= max_length:
        raise ValueError(
            "max_length set to {} while input already has {} tokens. Consider increasing max_length"
            .format(max_length, cur_len))

    # Get attention mask and decoder input ids
    if getattr(self.model, '_prepare_attention_mask_for_generation', None):
        assert len(input_ids.size()) == 2  # will break otherwise
        attention_mask = self.model._prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id)
        attention_mask = self.to(attention_mask)
    else:
        attention_mask = None

    if self.model_type == 'enc-dec':  # FIXME: only done because causal LMs like GPT-2 have the _prepare_decoder_input_ids_for_generation method but do not use it
        assert len(input_ids.size()) == 2  # will break otherwise
        decoder_input_ids = self.model._prepare_decoder_input_ids_for_generation(input_ids, None, None)
    else:
        decoder_input_ids = None

    # Print output
    if self.verbose:
        viz_id = self.display_input_sequence(input_ids[0])

    n_printed_tokens = n_input_tokens
    while cur_len < max_length:
        output_token_id, output = self._generate_token(encoder_input_ids=input_ids,
                                                       encoder_attention_mask=attention_mask,
                                                       decoder_input_ids=decoder_input_ids,
                                                       past=past,  # Note, this is not currently used
                                                       temperature=temperature,
                                                       top_k=top_k,
                                                       top_p=top_p,
                                                       do_sample=do_sample,
                                                       attribution_flag=attribution)

        if get_model_output:
            outputs.append(output)

        if decoder_input_ids is not None:
            assert len(decoder_input_ids.size()) == 2  # will break otherwise
            decoder_input_ids = torch.cat([decoder_input_ids, torch.tensor([[output_token_id]])], dim=-1)
        else:
            input_ids = torch.cat([input_ids, torch.tensor([[output_token_id]])], dim=-1)

        # Recompute the attention mask
        if getattr(self.model, '_prepare_attention_mask_for_generation', None):
            assert len(input_ids.size()) == 2  # will break otherwise
            attention_mask = self.model._prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id)
            attention_mask = self.to(attention_mask)

        if self.verbose:
            offset = n_input_tokens if decoder_input_ids is not None else 0
            generated_token_ids = decoder_input_ids if decoder_input_ids is not None else input_ids
            # More than one token can be generated at once (e.g., automatic split/pad tokens)
            while len(generated_token_ids[0]) + offset != n_printed_tokens:
                self.display_token(
                    viz_id,
                    generated_token_ids[0][n_printed_tokens - offset].cpu().numpy(),
                    cur_len
                )
                n_printed_tokens += 1

        cur_len = cur_len + 1
        if output_token_id == eos_token_id:
            break

    # Turn activations from dict to a proper array
    activations_dict = self._all_activations_dict
    for layer_type, activations in activations_dict.items():
        self.activations[layer_type] = activations_dict_to_array(activations)

    encoder_hidden_states = getattr(output, "encoder_hidden_states", None)
    decoder_hidden_states = getattr(output, "hidden_states", getattr(output, "decoder_hidden_states", None))

    if self.model_type in ['causal', 'mlm', 'enc-dec']:
        # The first hidden state is the embedding layer, skip it
        # FIXME: do this in a cleaner way
        embedding_states = decoder_hidden_states[0]
        decoder_hidden_states = decoder_hidden_states[1:]
    else:
        raise NotImplementedError(f"model type {self.model_type} not found")

    if decoder_input_ids is not None:
        assert len(decoder_input_ids.size()) == 2
        all_token_ids = torch.cat([input_ids, decoder_input_ids], dim=-1)[0]
    else:
        all_token_ids = input_ids[0]

    tokens = []
    for i in all_token_ids:
        token = self.tokenizer.decode([i])
        tokens.append(token)

    attributions = self.attributions
    attn = getattr(output, "attentions", None)

    return OutputSeq(**{'tokenizer': self.tokenizer,
                        'token_ids': all_token_ids.unsqueeze(0),  # Add a batch dimension
                        'n_input_tokens': n_input_tokens,
                        'output_text': self.tokenizer.decode(all_token_ids),
                        'tokens': [tokens],  # Add a batch dimension
                        'embedding_states': embedding_states,
                        'encoder_hidden_states': encoder_hidden_states,
                        'decoder_hidden_states': decoder_hidden_states,
                        'attention': attn,
                        'model_outputs': outputs,
                        'attribution': attributions,
                        'activations': self.activations,
                        'collect_activations_layer_nums': self.collect_activations_layer_nums,
                        'lm_head': self.model.lm_head,
                        'model_type': self.model_type,
                        'device': self.device})
```
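The returned `OutputSeq` bundles everything collected during generation. A hedged sketch of inspecting it (the attribute names follow the constructor call above; whether `OutputSeq` exposes each keyword as an attribute is an assumption):

```python
output = lm.generate("Hello computer", generate=5)

print(output.n_input_tokens)              # number of prompt tokens
print(output.tokens)                      # [[...]] token strings, with a batch dimension
print(len(output.decoder_hidden_states))  # one hidden-state tensor per layer
```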