Language Model

Ecco's central class. A wrapper around language models, used to run them and collect data such as input saliency and neuron activations.

An LM object is typically not created directly by users; it is returned by ecco.from_pretrained().

Usage:

import ecco

lm = ecco.from_pretrained('distilgpt2')
output = lm.generate("Hello computer")

__call__(self, input_tokens) special

Run a forward pass through the model, for when we don't care about output tokens. Currently only supports activation collection; no attribution/saliency.

Usage:

inputs = lm.tokenizer("Hello computer", return_tensors="pt")
output = lm(inputs)

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_tokens | Tensor | Dict-like object returned by tokenizer(TEXT, return_tensors="pt"). Contains the key 'input_ids', whose value is a tensor of input token ids with shape (batch_size, sequence_length), as well as a key for the attention mask. | required |
| attribution | | Flag indicating whether to calculate attribution/saliency. Not currently supported by __call__ (see the note above). | required |
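The input_tokens argument is whatever the tokenizer produces with return_tensors="pt". A minimal sketch of what that object typically contains (the exact key names depend on the tokenizer):

```python
inputs = lm.tokenizer("Hello computer", return_tensors="pt")

# BatchEncoding behaves like a dict; GPT-2-style tokenizers typically return
# 'input_ids' and 'attention_mask'
print(list(inputs.keys()))        # e.g. ['input_ids', 'attention_mask']
print(inputs['input_ids'].shape)  # torch.Size([batch_size, sequence_length])
```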
Source code in ecco/lm.py
def __call__(self,
             # input_str: Optional[str] = '',
             input_tokens: torch.Tensor,
             # attribution: Optional[bool] = True,
             ):
    """
    Run a forward pass through the model, for when we don't care about output tokens.
    Currently only supports activation collection. No attribution/saliency.

    Usage:

    ```python
    inputs = lm.tokenizer("Hello computer", return_tensors="pt")
    output = lm(inputs)
    ```

    Args:
        input_tokens: Dict-like object returned by tokenizer(TEXT, return_tensors="pt").
            Contains the key 'input_ids', whose value is a tensor of input token ids.
            Shape is (batch_size, sequence_length).
            Also contains a key for the attention mask.
        attribution: Flag indicating whether to calculate attribution/saliency
    """

    if 'input_ids' not in input_tokens:
        raise ValueError("Parameter 'input_tokens' needs to have the attribute 'input_ids'."
                         "Verify it was produced by the appropriate tokenizer with the "
                         "parameter return_tensors=\"pt\".")

    # Move inputs to GPU if the model is on GPU
    if self.model.device.type == "cuda" and input_tokens['input_ids'].device.type == "cpu":
        input_tokens = self.to(input_tokens)

    # Remove downstream. For now setting to batch length
    n_input_tokens = len(input_tokens['input_ids'][0])

    # model
    if self.model_type == 'mlm':
        output = self.model(**input_tokens, return_dict=True)
        lm_head = None
    elif self.model_type == 'causal':
        output = self.model(**input_tokens, return_dict=True, use_cache=False)
        lm_head = self.model.lm_head
    elif self.model_type == 'enc-dec':
        decoder_input_ids = self.model._prepare_decoder_input_ids_for_generation(input_tokens['input_ids'], None, None)
        output = self.model(**input_tokens, decoder_input_ids=decoder_input_ids, return_dict=True, use_cache=False)
        lm_head = self.model.lm_head
    else:
        raise NotImplementedError(f"model type {self.model_type} not found")

    # Turn activations from dict to a proper array
    activations_dict = self._all_activations_dict
    for layer_type, activations in activations_dict.items():
        self.activations[layer_type] = activations_dict_to_array(activations)

    encoder_hidden_states = getattr(output, "encoder_hidden_states", None)
    decoder_hidden_states = getattr(output, "hidden_states", getattr(output, "decoder_hidden_states", None))

    if self.model_type in ['causal', 'mlm']:
        # First hidden state of the causal model is the embedding layer, skip it
        # FIXME: do this in a cleaner way
        embedding_states = decoder_hidden_states[0]
        decoder_hidden_states = decoder_hidden_states[1:]
    elif self.model_type == 'enc-dec':
        embedding_states = encoder_hidden_states[0]
        encoder_hidden_states = encoder_hidden_states[1:]
    else:
        raise NotImplementedError(f"model type {self.model_type} not found")


    tokens = []
    for i in input_tokens['input_ids']:
        token = self.tokenizer.convert_ids_to_tokens(i)
        tokens.append(token)

    attn = getattr(output, "attentions", None)
    return OutputSeq(**{'tokenizer': self.tokenizer,
                        'token_ids': input_tokens['input_ids'],
                        'n_input_tokens': n_input_tokens,
                        # 'output_text': self.tokenizer.decode(input_ids),
                        'tokens': tokens,
                        'embedding_states': embedding_states,
                        'encoder_hidden_states': encoder_hidden_states,
                        'decoder_hidden_states': decoder_hidden_states,
                        'attention': attn,
                        # 'model_outputs': outputs,
                        # 'attribution': attributions,
                        'activations': self.activations,
                        'collect_activations_layer_nums': self.collect_activations_layer_nums,
                        'lm_head': lm_head,
                        'model_type': self.model_type,
                        'device': self.device})
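A rough sketch of inspecting the returned OutputSeq, assuming the model was loaded with activations=True so that the forward pass records neuron activations (attribute names mirror the constructor arguments above; treat them as assumptions if your version differs):

```python
import ecco

# Load the model with activation collection enabled (maps to collect_activations_flag)
lm = ecco.from_pretrained('distilgpt2', activations=True)

inputs = lm.tokenizer("Hello computer", return_tensors="pt")
output = lm(inputs)

print(output.n_input_tokens)       # number of input tokens
print(output.tokens)               # the input tokens, one list per batch item
for layer_type, acts in output.activations.items():
    print(layer_type, acts.shape)  # collected activations per layer type
```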

__init__(self, model, tokenizer, model_name, config, collect_activations_flag=False, collect_activations_layer_nums=None, verbose=True, gpu=True) special

Creates an LM object given a model and tokenizer.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| model | PreTrainedModel | HuggingFace Transformers PyTorch language model. | required |
| tokenizer | PreTrainedTokenizerFast | The tokenizer associated with the model. | required |
| model_name | str | The name of the model. Used to retrieve required settings (like what the embedding layer is called). | required |
| config | Dict[str, Any] | Configuration describing the layers whose activations we will collect. | required |
| collect_activations_flag | Optional[bool] | True if we want to collect activations. | False |
| collect_activations_layer_nums | Optional[List[int]] | If collecting activations, indicates which layers to track. Defaults to None, which collects activations for all layers. | None |
| verbose | Optional[bool] | If True, model.generate() displays output tokens in HTML as they're generated. | True |
| gpu | Optional[bool] | Set to False to force using the CPU even if a GPU exists. | True |
Source code in ecco/lm.py
def __init__(self,
             model: transformers.PreTrainedModel,
             tokenizer: transformers.PreTrainedTokenizerFast,
             model_name: str,
             config: Dict[str, Any],
             collect_activations_flag: Optional[bool] = False,
             collect_activations_layer_nums: Optional[List[int]] = None,  # None --> collect for all layers
             verbose: Optional[bool] = True,
             gpu: Optional[bool] = True
             ):
    """
    Creates an LM object given a model and tokenizer.

    Args:
        model: HuggingFace Transformers Pytorch language model.
        tokenizer: The tokenizer associated with the model
        model_name: The name of the model. Used to retrieve required settings (like what the embedding layer is called)
        config: Configuration that has the information about the layer whose activations we will collect
        collect_activations_flag: True if we want to collect activations
        collect_activations_layer_nums: If collecting activations, we can use this parameter to indicate which layers
            to track. By default this would be None and we'd collect activations for all layers.
        verbose: If True, model.generate() displays output tokens in HTML as they're generated.
        gpu: Set to False to force using the CPU even if a GPU exists.
    """
    self.model_name = model_name
    self.model = model
    if torch.cuda.is_available() and gpu:
        self.model = model.to('cuda')

    self.device = 'cuda' if torch.cuda.is_available() \
                            and self.model.device.type == 'cuda' \
        else 'cpu'

    self.tokenizer = tokenizer
    self.verbose = verbose
    self._path = os.path.dirname(ecco.__file__)

    # Neuron Activation
    self.collect_activations_flag = collect_activations_flag
    self.collect_activations_layer_nums = collect_activations_layer_nums

    # For each model, this indicates the layer whose activations
    # we will collect
    self.model_config = config
    try:
        self.model_type = self.model_config['type']
        embeddings_layer_name = self.model_config['embedding']
        embed_retriever = attrgetter(embeddings_layer_name)
        self.model_embeddings = embed_retriever(self.model)
        self.collect_activations_layer_name_sig = self.model_config['activations'][0]
    except KeyError:
        raise ValueError(
               f"The model '{self.model_name}' is not correctly configured in Ecco's 'model-config.yaml' file"
        ) from KeyError()

    self._hooks = {}
    self._reset()
    self._attach_hooks(self.model)

generate(self, input_str, max_length=8, temperature=None, top_k=None, top_p=None, get_model_output=False, do_sample=None, attribution=True, generate=None)

Generate tokens in response to an input prompt. Works with autoregressive language models like GPT-2, not with masked language models like BERT.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_str | str | Input prompt. (TODO: accept a batch of input strings.) | required |
| generate | Optional[int] | Number of tokens to generate. | None |
| max_length | Optional[int] | Maximum length of the sequence (input + output tokens). | 8 |
| temperature | Optional[float] | Adjusts the probability distribution of output candidate tokens. | None |
| top_k | Optional[int] | Number of top-scoring tokens to consider in decoding. Only used when do_sample is True. | None |
| top_p | Optional[float] | Top-p (nucleus) probability mass to consider in decoding. Only used when do_sample is True. | None |
| get_model_output | Optional[bool] | Flag to retrieve the final output object returned by the underlying language model. | False |
| do_sample | Optional[bool] | Decoding parameter. If set to False, the model always chooses the highest-scoring candidate output token, which may lead to repetitive text. If set to True, the model consults top_k and/or top_p to generate more interesting output. | None |
| attribution | Optional[bool] | If True, the object will calculate input saliency/attribution. | True |
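Note that generate and max_length overlap: as the source below shows, passing generate sets max_length to n_input_tokens + generate. Assuming "Hello computer" tokenizes into two tokens, the following two calls therefore request the same total sequence length:

```python
output_a = lm.generate("Hello computer", generate=6)    # 2 input tokens + 6 generated
output_b = lm.generate("Hello computer", max_length=8)  # 8 tokens total (input + output)
```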
Source code in ecco/lm.py
def generate(self, input_str: str,
             max_length: Optional[int] = 8,
             temperature: Optional[float] = None,
             top_k: Optional[int] = None,
             top_p: Optional[float] = None,
             get_model_output: Optional[bool] = False,
             do_sample: Optional[bool] = None,
             attribution: Optional[bool] = True,
             generate: Optional[int] = None):
    """
    Generate tokens in response to an input prompt.
    Works with autoregressive language models like GPT-2, not masked language models like BERT.
    Args:
        input_str: Input prompt. # TODO: accept batch of input strings
        generate: Number of tokens to generate.
        max_length: max length of sequence (input + output tokens)
        temperature: Adjust the probability distribution of output candidate tokens.
        top_k: Specify top-k tokens to consider in decoding. Only used when do_sample is True.
        top_p: Specify top-p to consider in decoding. Only used when do_sample is True.
        get_model_output:  Flag to retrieve the final output object returned by the underlying language model.
        do_sample: Decoding parameter. If set to False, the model always
            chooses the highest scoring candidate output token. This may lead to
            repetitive text. If set to True, the model consults top_k and/or top_p
            to generate more interesting output.
        attribution: If True, the object will calculate input saliency/attribution.
    """

    assert self.model_type != 'mlm', "generate method not supported for MLMs"

    top_k = top_k if top_k is not None else self.model.config.top_k
    top_p = top_p if top_p is not None else self.model.config.top_p
    temperature = temperature if temperature is not None else self.model.config.temperature
    do_sample = do_sample if do_sample is not None else self.model.config.task_specific_params.get('text-generation', {}).get('do_sample', False)

    pad_token_id = self.model.config.pad_token_id
    eos_token_id = self.model.config.eos_token_id

    # We need this as a batch in order to collect activations.
    input_ids = self.tokenizer(input_str, return_tensors="pt")['input_ids']
    n_input_tokens = len(input_ids[0])
    cur_len = n_input_tokens

    if generate is not None:
        max_length = n_input_tokens + generate

    past = None
    self.attributions = {}
    outputs = []

    if cur_len >= max_length:
        raise ValueError(
            "max_length set to {} while input token has more tokens ({}). Consider increasing max_length" \
                .format(max_length, cur_len))

    # Get attention mask and decoder input ids
    if getattr(self.model, '_prepare_attention_mask_for_generation', None):
        assert len(input_ids.size()) == 2 # will break otherwise
        attention_mask = self.model._prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id)
        attention_mask = self.to(attention_mask)
    else:
        attention_mask = None

    if self.model_type == 'enc-dec': # FIXME: only done because causal LMs like GPT-2 have the _prepare_decoder_input_ids_for_generation method but do not use it
        assert len(input_ids.size()) == 2 # will break otherwise
        decoder_input_ids = self.model._prepare_decoder_input_ids_for_generation(input_ids, None, None)
    else:
        decoder_input_ids = None

    # Print output
    if self.verbose:
        viz_id = self.display_input_sequence(input_ids[0])
        n_printed_tokens = n_input_tokens

    while cur_len < max_length:
        output_token_id, output = self._generate_token(encoder_input_ids=input_ids,
                                                       encoder_attention_mask=attention_mask,
                                                       decoder_input_ids=decoder_input_ids,
                                                       past=past,  # Note, this is not currently used
                                                       temperature=temperature,
                                                       top_k=top_k,
                                                       top_p=top_p,
                                                       do_sample=do_sample,
                                                       attribution_flag=attribution)

        if get_model_output:
            outputs.append(output)

        if decoder_input_ids is not None:
            assert len(decoder_input_ids.size()) == 2 # will break otherwise
            decoder_input_ids = torch.cat([decoder_input_ids, torch.tensor([[output_token_id]])], dim=-1)
        else:
            input_ids = torch.cat([input_ids, torch.tensor([[output_token_id]])], dim=-1)

            # Recomputing Attention Mask
            if getattr(self.model, '_prepare_attention_mask_for_generation', None):
                assert len(input_ids.size()) == 2 # will break otherwise
                attention_mask = self.model._prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id)
                attention_mask = self.to(attention_mask)

        if self.verbose:

            offset = n_input_tokens if decoder_input_ids is not None else 0
            generated_token_ids = decoder_input_ids if decoder_input_ids is not None else input_ids

            # More than one token can be generated at once (e.g., automatic split/pad tokens)
            while len(generated_token_ids[0]) + offset != n_printed_tokens:
                self.display_token(
                    viz_id,
                    generated_token_ids[0][n_printed_tokens - offset].cpu().numpy(),
                    cur_len
                )
                n_printed_tokens += 1

        cur_len = cur_len + 1

        if output_token_id == eos_token_id:
            break

    # Turn activations from dict to a proper array
    activations_dict = self._all_activations_dict
    for layer_type, activations in activations_dict.items():
        self.activations[layer_type] = activations_dict_to_array(activations)

    encoder_hidden_states = getattr(output, "encoder_hidden_states", None)
    decoder_hidden_states = getattr(output, "hidden_states", getattr(output, "decoder_hidden_states", None))

    if self.model_type in ['causal', 'mlm', 'enc-dec']:
        # First hidden state of the causal model is the embedding layer, skip it
        # FIXME: do this in a cleaner way
        embedding_states = decoder_hidden_states[0]
        decoder_hidden_states = decoder_hidden_states[1:]
    else:
        raise NotImplementedError(f"model type {self.model_type} not found")

    if decoder_input_ids is not None:
        assert len(decoder_input_ids.size()) == 2
        all_token_ids = torch.cat([input_ids, decoder_input_ids], dim=-1)[0]
    else:
        all_token_ids = input_ids[0]

    tokens = []
    for i in all_token_ids:
        token = self.tokenizer.decode([i])
        tokens.append(token)

    attributions = self.attributions
    attn = getattr(output, "attentions", None)

    return OutputSeq(**{'tokenizer': self.tokenizer,
                        'token_ids': all_token_ids.unsqueeze(0),  # Add a batch dimension
                        'n_input_tokens': n_input_tokens,
                        'output_text': self.tokenizer.decode(all_token_ids),
                        'tokens': [tokens],  # Add a batch dimension
                        'embedding_states': embedding_states,
                        'encoder_hidden_states': encoder_hidden_states,
                        'decoder_hidden_states': decoder_hidden_states,
                        'attention': attn,
                        'model_outputs': outputs,
                        'attribution': attributions,
                        'activations': self.activations,
                        'collect_activations_layer_nums': self.collect_activations_layer_nums,
                        'lm_head': self.model.lm_head,
                        'model_type': self.model_type,
                        'device': self.device})
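A fuller usage sketch that combines the sampling parameters above with Ecco's saliency view (output.saliency() follows Ecco's documented OutputSeq API; treat the exact method name as an assumption if your version differs):

```python
import ecco

lm = ecco.from_pretrained('distilgpt2')

# Sample 10 new tokens; attribution=True (the default) keeps input saliency data
output = lm.generate("The countries of the European Union are:",
                     generate=10,
                     do_sample=True,
                     top_k=50,
                     top_p=0.9,
                     temperature=0.7,
                     attribution=True)

output.saliency()  # visualize input saliency for each generated token
```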