# Language Model

Ecco's central class. A wrapper around language models. We use it to run the language models and collect important data like input saliency and neuron activations.

An `LM` object is typically not created directly by users; it is returned by `ecco.from_pretrained()`.
Usage:

```python
import ecco

lm = ecco.from_pretrained('distilgpt2')
output = lm.generate("Hello computer")
```
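The data collected during generation can then be explored through the returned object. A minimal sketch, assuming a Jupyter notebook environment; the `saliency()` visualization method follows Ecco's published examples and is an assumption here:

```python
import ecco

# Load the model; generate() calculates attribution by default.
lm = ecco.from_pretrained('distilgpt2')
output = lm.generate("Hello computer", generate=5)

# Visualize input saliency for the generated tokens (notebook only).
output.saliency()
```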
## __call__(self, input_tokens)

*special*

Runs a forward pass through the model, for when we don't care about output tokens. Currently only supports activation collection; no attribution/saliency.
Usage:

```python
inputs = lm.tokenizer("Hello computer", return_tensors="pt")
output = lm(inputs)
```
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `input_tokens` | `Tensor` | Dict returned by `tokenizer(text, return_tensors="pt")`. Contains the key `'input_ids'`, whose value is a tensor of input token ids with shape `(batch_size, sequence_length)`, as well as an `'attention_mask'` key. | *required* |
| `attribution` | | Flag indicating whether to calculate attribution/saliency. Currently unused: `__call__` does not support attribution. | *required* |
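For instance, a forward pass can collect neuron activations without generating any tokens. A minimal sketch, assuming the `LM` was created with activation collection enabled (e.g. `ecco.from_pretrained('distilgpt2', activations=True)`); the `'decoder'` key and the array shape depend on the model type:

```python
inputs = lm.tokenizer("Hello computer", return_tensors="pt")
output = lm(inputs)

# Activations are grouped by layer type ('decoder' for a causal model is an
# assumption here); each entry aggregates the hooked layers' activations.
print(output.activations['decoder'].shape)
```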
Source code in ecco/lm.py
````python
def __call__(self,
             # input_str: Optional[str] = '',
             input_tokens: torch.Tensor,
             # attribution: Optional[bool] = True,
             ):
    """
    Run a forward pass through the model. For when we don't care about output tokens.
    Currently only supports activation collection. No attribution/saliency.

    Usage:

    ```python
    inputs = lm.tokenizer("Hello computer", return_tensors="pt")
    output = lm(inputs)
    ```

    Args:
        input_tokens: Dict returned by tokenizer(text, return_tensors="pt").
            Contains the key 'input_ids', whose value is a tensor of input token ids.
            Shape is (batch_size, sequence_length).
            Also contains an 'attention_mask' key.
        attribution: Flag indicating whether to calculate attribution/saliency
    """
    if 'input_ids' not in input_tokens:
        raise ValueError("Parameter 'input_tokens' needs to have the attribute 'input_ids'. "
                         "Verify it was produced by the appropriate tokenizer with the "
                         "parameter return_tensors=\"pt\".")

    # Move inputs to GPU if the model is on GPU
    if self.model.device.type == "cuda" and input_tokens['input_ids'].device.type == "cpu":
        input_tokens = self.to(input_tokens)

    # Remove downstream. For now setting to batch length
    n_input_tokens = len(input_tokens['input_ids'][0])

    # model
    if self.model_type == 'mlm':
        output = self.model(**input_tokens, return_dict=True)
        lm_head = None
    elif self.model_type == 'causal':
        output = self.model(**input_tokens, return_dict=True, use_cache=False)
        lm_head = self.model.lm_head
    elif self.model_type == 'enc-dec':
        decoder_input_ids = self.model._prepare_decoder_input_ids_for_generation(input_tokens['input_ids'], None, None)
        output = self.model(**input_tokens, decoder_input_ids=decoder_input_ids, return_dict=True, use_cache=False)
        lm_head = self.model.lm_head
    else:
        raise NotImplementedError(f"model type {self.model_type} not found")

    # Turn activations from dict to a proper array
    activations_dict = self._all_activations_dict
    for layer_type, activations in activations_dict.items():
        self.activations[layer_type] = activations_dict_to_array(activations)

    encoder_hidden_states = getattr(output, "encoder_hidden_states", None)
    decoder_hidden_states = getattr(output, "hidden_states", getattr(output, "decoder_hidden_states", None))

    if self.model_type in ['causal', 'mlm']:
        # The first hidden state of a causal/MLM model is the embedding layer, skip it
        # FIXME: do this in a cleaner way
        embedding_states = decoder_hidden_states[0]
        decoder_hidden_states = decoder_hidden_states[1:]
    elif self.model_type == 'enc-dec':
        embedding_states = encoder_hidden_states[0]
        encoder_hidden_states = encoder_hidden_states[1:]
    else:
        raise NotImplementedError(f"model type {self.model_type} not found")

    tokens = []
    for i in input_tokens['input_ids']:
        token = self.tokenizer.convert_ids_to_tokens(i)
        tokens.append(token)

    attn = getattr(output, "attentions", None)

    return OutputSeq(**{'tokenizer': self.tokenizer,
                        'token_ids': input_tokens['input_ids'],
                        'n_input_tokens': n_input_tokens,
                        # 'output_text': self.tokenizer.decode(input_ids),
                        'tokens': tokens,
                        'embedding_states': embedding_states,
                        'encoder_hidden_states': encoder_hidden_states,
                        'decoder_hidden_states': decoder_hidden_states,
                        'attention': attn,
                        # 'model_outputs': outputs,
                        # 'attribution': attributions,
                        'activations': self.activations,
                        'collect_activations_layer_nums': self.collect_activations_layer_nums,
                        'lm_head': lm_head,
                        'model_type': self.model_type,
                        'device': self.device})
````
## __init__(self, model, tokenizer, model_name, config, collect_activations_flag=False, collect_activations_layer_nums=None, verbose=True, gpu=True)

*special*
Creates an LM object given a model and tokenizer.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `model` | `PreTrainedModel` | HuggingFace Transformers PyTorch language model. | *required* |
| `tokenizer` | `PreTrainedTokenizerFast` | The tokenizer associated with the model. | *required* |
| `model_name` | `str` | The name of the model. Used to retrieve required settings (like what the embedding layer is called). | *required* |
| `config` | `Dict[str, Any]` | Configuration that has the information about the layers whose activations we will collect. | *required* |
| `collect_activations_flag` | `Optional[bool]` | True if we want to collect activations. | `False` |
| `collect_activations_layer_nums` | `Optional[List[int]]` | If collecting activations, indicates which layers to track. By default this is None, and activations are collected for all layers. | `None` |
| `verbose` | `Optional[bool]` | If True, `model.generate()` displays output tokens in HTML as they're generated. | `True` |
| `gpu` | `Optional[bool]` | Set to False to force using the CPU even if a GPU exists. | `True` |
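For reference, a minimal sketch of constructing an `LM` directly; users would normally rely on `ecco.from_pretrained()` instead. The `config` keys mirror what `__init__` reads (`'type'`, `'embedding'`, `'activations'`); the GPT-2 values below are illustrative and may differ from Ecco's bundled `model-config.yaml`:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from ecco.lm import LM

# Request hidden states and attentions so they appear in model outputs.
model = AutoModelForCausalLM.from_pretrained('distilgpt2',
                                             output_hidden_states=True,
                                             output_attentions=True)
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')

# Illustrative config; the keys match what __init__ reads below.
config = {
    'type': 'causal',                 # 'causal', 'mlm', or 'enc-dec'
    'embedding': 'transformer.wte',   # attribute path to the embedding layer
    'activations': [r'mlp\.c_proj'],  # signature of the layers to hook
}

lm = LM(model, tokenizer,
        model_name='distilgpt2',
        config=config,
        collect_activations_flag=True)
```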
Source code in ecco/lm.py
```python
def __init__(self,
             model: transformers.PreTrainedModel,
             tokenizer: transformers.PreTrainedTokenizerFast,
             model_name: str,
             config: Dict[str, Any],
             collect_activations_flag: Optional[bool] = False,
             collect_activations_layer_nums: Optional[List[int]] = None,  # None --> collect for all layers
             verbose: Optional[bool] = True,
             gpu: Optional[bool] = True
             ):
    """
    Creates an LM object given a model and tokenizer.

    Args:
        model: HuggingFace Transformers PyTorch language model.
        tokenizer: The tokenizer associated with the model
        model_name: The name of the model. Used to retrieve required settings (like what the embedding layer is called)
        config: Configuration that has the information about the layers whose activations we will collect
        collect_activations_flag: True if we want to collect activations
        collect_activations_layer_nums: If collecting activations, we can use this parameter to indicate which layers
            to track. By default this is None, and we collect activations for all layers.
        verbose: If True, model.generate() displays output tokens in HTML as they're generated.
        gpu: Set to False to force using the CPU even if a GPU exists.
    """
    self.model_name = model_name
    self.model = model
    if torch.cuda.is_available() and gpu:
        self.model = model.to('cuda')

    self.device = ('cuda' if torch.cuda.is_available()
                   and self.model.device.type == 'cuda'
                   else 'cpu')

    self.tokenizer = tokenizer
    self.verbose = verbose
    self._path = os.path.dirname(ecco.__file__)

    # Neuron Activation
    self.collect_activations_flag = collect_activations_flag
    self.collect_activations_layer_nums = collect_activations_layer_nums

    # For each model, this indicates the layer whose activations
    # we will collect
    self.model_config = config
    try:
        self.model_type = self.model_config['type']
        embeddings_layer_name = self.model_config['embedding']
        embed_retriever = attrgetter(embeddings_layer_name)
        self.model_embeddings = embed_retriever(self.model)
        self.collect_activations_layer_name_sig = self.model_config['activations'][0]
    except KeyError:
        raise ValueError(
            f"The model '{self.model_name}' is not correctly configured in Ecco's 'model-config.yaml' file"
        ) from KeyError()

    self._hooks = {}
    self._reset()
    self._attach_hooks(self.model)
```
## generate(self, input_str, max_length=8, temperature=None, top_k=None, top_p=None, get_model_output=False, do_sample=None, attribution=True, generate=None)

Generates tokens in response to an input prompt. Works with autoregressive language models like GPT-2, not masked language models like BERT.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `input_str` | `str` | Input prompt. | *required* |
| `generate` | `Optional[int]` | Number of tokens to generate. | `None` |
| `max_length` | `Optional[int]` | Maximum length of the sequence (input + output tokens). | `8` |
| `temperature` | `Optional[float]` | Adjusts the probability distribution of output candidate tokens. | `None` |
| `top_k` | `Optional[int]` | Specifies the top-k tokens to consider in decoding. Only used when `do_sample` is True. | `None` |
| `top_p` | `Optional[float]` | Specifies top-p to consider in decoding. Only used when `do_sample` is True. | `None` |
| `get_model_output` | `Optional[bool]` | Flag to retrieve the final output object returned by the underlying language model. | `False` |
| `do_sample` | `Optional[bool]` | Decoding parameter. If set to False, the model always chooses the highest-scoring candidate output token, which may lead to repetitive text. If set to True, the model consults `top_k` and/or `top_p` to generate more interesting output. | `None` |
| `attribution` | `Optional[bool]` | If True, the object will calculate input saliency/attribution. | `True` |
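A short sketch of the two decoding modes described above (the prompt and parameter values are illustrative):

```python
# Greedy decoding: always pick the highest-scoring token. May repeat itself.
output = lm.generate("The capital of France is", generate=5, do_sample=False)

# Sampled decoding: consult top_k / top_p / temperature for more varied output.
output = lm.generate("The capital of France is", generate=5,
                     do_sample=True, top_k=50, top_p=0.95, temperature=0.8)
```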
Source code in ecco/lm.py
```python
def generate(self, input_str: str,
             max_length: Optional[int] = 8,
             temperature: Optional[float] = None,
             top_k: Optional[int] = None,
             top_p: Optional[float] = None,
             get_model_output: Optional[bool] = False,
             do_sample: Optional[bool] = None,
             attribution: Optional[bool] = True,
             generate: Optional[int] = None):
    """
    Generate tokens in response to an input prompt.
    Works with language models like GPT2, not masked language models like BERT.

    Args:
        input_str: Input prompt. # TODO: accept batch of input strings
        generate: Number of tokens to generate.
        max_length: max length of sequence (input + output tokens)
        temperature: Adjust the probability distribution of output candidate tokens.
        top_k: Specify top-k tokens to consider in decoding. Only used when do_sample is True.
        top_p: Specify top-p to consider in decoding. Only used when do_sample is True.
        get_model_output: Flag to retrieve the final output object returned by the underlying language model.
        do_sample: Decoding parameter. If set to False, the model always
            chooses the highest scoring candidate output token. This may lead
            to repetitive text. If set to True, the model consults top_k
            and/or top_p to generate more interesting output.
        attribution: If True, the object will calculate input saliency/attribution.
    """
    assert self.model_type != 'mlm', "generate method not supported for MLMs"

    top_k = top_k if top_k is not None else self.model.config.top_k
    top_p = top_p if top_p is not None else self.model.config.top_p
    temperature = temperature if temperature is not None else self.model.config.temperature
    do_sample = do_sample if do_sample is not None else self.model.config.task_specific_params.get('text-generation', {}).get('do_sample', False)

    pad_token_id = self.model.config.pad_token_id
    eos_token_id = self.model.config.eos_token_id

    # We need this as a batch in order to collect activations.
    input_ids = self.tokenizer(input_str, return_tensors="pt")['input_ids']
    n_input_tokens = len(input_ids[0])
    cur_len = n_input_tokens

    if generate is not None:
        max_length = n_input_tokens + generate

    past = None
    self.attributions = {}
    outputs = []

    if cur_len >= max_length:
        raise ValueError(
            "max_length set to {} while input already has {} tokens. Consider increasing max_length"
            .format(max_length, cur_len))

    # Get attention mask and decoder input ids
    if getattr(self.model, '_prepare_attention_mask_for_generation', None):
        assert len(input_ids.size()) == 2  # will break otherwise
        attention_mask = self.model._prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id)
        attention_mask = self.to(attention_mask)
    else:
        attention_mask = None

    if self.model_type == 'enc-dec':  # FIXME: only done because causal LMs like GPT-2 have the _prepare_decoder_input_ids_for_generation method but do not use it
        assert len(input_ids.size()) == 2  # will break otherwise
        decoder_input_ids = self.model._prepare_decoder_input_ids_for_generation(input_ids, None, None)
    else:
        decoder_input_ids = None

    # Print output
    if self.verbose:
        viz_id = self.display_input_sequence(input_ids[0])

    n_printed_tokens = n_input_tokens
    while cur_len < max_length:
        output_token_id, output = self._generate_token(encoder_input_ids=input_ids,
                                                       encoder_attention_mask=attention_mask,
                                                       decoder_input_ids=decoder_input_ids,
                                                       past=past,  # Note, this is not currently used
                                                       temperature=temperature,
                                                       top_k=top_k,
                                                       top_p=top_p,
                                                       do_sample=do_sample,
                                                       attribution_flag=attribution)

        if get_model_output:
            outputs.append(output)

        if decoder_input_ids is not None:
            assert len(decoder_input_ids.size()) == 2  # will break otherwise
            decoder_input_ids = torch.cat([decoder_input_ids, torch.tensor([[output_token_id]])], dim=-1)
        else:
            input_ids = torch.cat([input_ids, torch.tensor([[output_token_id]])], dim=-1)

        # Recompute the attention mask
        if getattr(self.model, '_prepare_attention_mask_for_generation', None):
            assert len(input_ids.size()) == 2  # will break otherwise
            attention_mask = self.model._prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id)
            attention_mask = self.to(attention_mask)

        if self.verbose:
            offset = n_input_tokens if decoder_input_ids is not None else 0
            generated_token_ids = decoder_input_ids if decoder_input_ids is not None else input_ids
            # More than one token can be generated at once (e.g., automatic split/pad tokens)
            while len(generated_token_ids[0]) + offset != n_printed_tokens:
                self.display_token(
                    viz_id,
                    generated_token_ids[0][n_printed_tokens - offset].cpu().numpy(),
                    cur_len
                )
                n_printed_tokens += 1

        cur_len = cur_len + 1
        if output_token_id == eos_token_id:
            break

    # Turn activations from dict to a proper array
    activations_dict = self._all_activations_dict
    for layer_type, activations in activations_dict.items():
        self.activations[layer_type] = activations_dict_to_array(activations)

    encoder_hidden_states = getattr(output, "encoder_hidden_states", None)
    decoder_hidden_states = getattr(output, "hidden_states", getattr(output, "decoder_hidden_states", None))

    if self.model_type in ['causal', 'mlm', 'enc-dec']:
        # The first hidden state is the embedding layer, skip it
        # FIXME: do this in a cleaner way
        embedding_states = decoder_hidden_states[0]
        decoder_hidden_states = decoder_hidden_states[1:]
    else:
        raise NotImplementedError(f"model type {self.model_type} not found")

    if decoder_input_ids is not None:
        assert len(decoder_input_ids.size()) == 2
        all_token_ids = torch.cat([input_ids, decoder_input_ids], dim=-1)[0]
    else:
        all_token_ids = input_ids[0]

    tokens = []
    for i in all_token_ids:
        token = self.tokenizer.decode([i])
        tokens.append(token)

    attributions = self.attributions
    attn = getattr(output, "attentions", None)

    return OutputSeq(**{'tokenizer': self.tokenizer,
                        'token_ids': all_token_ids.unsqueeze(0),  # Add a batch dimension
                        'n_input_tokens': n_input_tokens,
                        'output_text': self.tokenizer.decode(all_token_ids),
                        'tokens': [tokens],  # Add a batch dimension
                        'embedding_states': embedding_states,
                        'encoder_hidden_states': encoder_hidden_states,
                        'decoder_hidden_states': decoder_hidden_states,
                        'attention': attn,
                        'model_outputs': outputs,
                        'attribution': attributions,
                        'activations': self.activations,
                        'collect_activations_layer_nums': self.collect_activations_layer_nums,
                        'lm_head': self.model.lm_head,
                        'model_type': self.model_type,
                        'device': self.device})
```
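The returned `OutputSeq` bundles everything collected during generation. A hedged sketch of inspecting it (the attribute names follow the constructor call above; whether `OutputSeq` exposes each keyword as an attribute is an assumption):

```python
output = lm.generate("Hello computer", generate=5)

print(output.n_input_tokens)              # number of prompt tokens
print(output.tokens)                      # [[...]] token strings, with a batch dimension
print(len(output.decoder_hidden_states))  # one hidden-state tensor per layer
```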