from numpy import float64, nditer, uint64, zeros, ndarray
from pandas import Series
from os import path, remove
from pickle import load
from ArbID import ArbID
from Signal import Signal
from PipelineTimer import PipelineTimer
from typing import List


def tokenize_dictionary(a_timer: PipelineTimer,
                        d: dict,
                        force: bool = False,
                        include_padding: bool = False,
                        merge: bool = True,
                        max_distance: float = 0.1):
    a_timer.start_function_time()

    for k, arb_id in d.items():
        if not arb_id.static:
            if arb_id.padding and not force:
                print("\nTokenization already completed and forcing is turned off. Skipping...")
                return
            a_timer.start_iteration_time()
            get_composition(arb_id, include_padding, max_distance)
            a_timer.set_tang_to_composition()
            if merge:
                a_timer.start_iteration_time()
                merge_tokens(arb_id, max_distance)
                a_timer.set_composition_merge()

    a_timer.set_tokenization()


# This is a greedy algorithm to cluster bit positions in a series of CAN payloads suspected of being part of a
# continuous numerical time series.
def get_composition_just_tang(this_tang: ndarray, include_padding=False, max_inversion_distance: float = 0.0):
    tokens: List[tuple] = []
    padding = []
    start_index = 0
    currently_clustering = False
    big_endian = True
    last_bit_position = 0

    # Consider each element in the TANG. The TANG is an ndarray whose index is the bit position from the
    # original CAN data. The cell value is the observed transition frequency for that bit position.
    for i, bit_position in enumerate(nditer(this_tang)):
        # Is this a padding bit?
        if bit_position <= 0.000001:
            padding.append(i)
            # Are we clustering padding bits? If so, proceed to the normal clustering logic. Else, do the following.
            if not include_padding:
                if currently_clustering:
                    # This is padding, we're already clustering, and we're not clustering padding; save the token.
                    tokens.append((start_index, i - 1))
                    currently_clustering = False
                start_index = i + 1
                last_bit_position = bit_position
                continue
        # Are we still enlarging the current token?
        if currently_clustering:
            if bit_position >= last_bit_position and big_endian:
                pass
            elif bit_position <= last_bit_position and not big_endian:
                pass
            # Are we allowing inversions (max_inversion_distance > 0)? If so, check if this inversion is acceptable.
            elif abs(bit_position - last_bit_position) <= max_inversion_distance:
                pass
            # Is this the second bit position, which we need to establish the endianness of the signal?
            elif start_index == i - 1:
                if bit_position >= last_bit_position:
                    big_endian = True
                else:
                    big_endian = False
            # This is an unacceptable transition frequency inversion; save the current token and start a new one.
            else:
                tokens.append((start_index, i - 1))
                start_index = i
        # We aren't currently clustering and we intend to cluster this bit position.
        else:
            currently_clustering = True
            start_index = i
        last_bit_position = bit_position

    # We reached the last bit position while clustering. Add this final token.
    if currently_clustering:
        tokens.append((start_index, len(this_tang) - 1))
    return tokens, padding


def get_composition(arb_id: ArbID, include_padding=False, max_inversion_distance: float = 0.0):
    arb_id.tokenization, arb_id.padding = \
        get_composition_just_tang(arb_id.tang, include_padding, max_inversion_distance)
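# Illustrative usage sketch (not part of the original pipeline; the TANG values below are hypothetical).
# Given a TANG in which bit positions 0-2 show monotonically increasing transition frequencies,
# positions 3-4 never transition (padding), and positions 5-7 form a second increasing run,
# the greedy pass above should produce two tokens and report the two padding positions:
#
#   from numpy import array
#   example_tang = array([0.05, 0.10, 0.20, 0.0, 0.0, 0.40, 0.45, 0.50])
#   tokens, padding = get_composition_just_tang(example_tang, include_padding=False)
#   # tokens  -> [(0, 2), (5, 7)]
#   # padding -> [3, 4]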
def merge_tokens_just_composition(tokens: list, this_tang, max_distance: float):
    verbose = False
    if len(tokens) < 2:
        # Make sure there are multiple tokens to merge.
        pass
    else:
        # Editing data structures while iterating over them is a bad idea in Python.
        # Instead, let's keep track of tokens we want to delete using remove_list.
        remove_list = []
        last = None
        for i, token in enumerate(tokens):
            if verbose:
                print("Last:", last, "\tCurrent:", token)
            if last:
                # Are these tokens adjacent?
                if last[1] + 1 == token[0]:
                    if verbose:
                        print("\tAdjacent with distance of", abs(this_tang[last[1]] - this_tang[token[0]]))
                    # Is the transition frequency difference of the adjacent bit positions within the max distance
                    # threshold?
                    if abs(this_tang[last[1]] - this_tang[token[0]]) <= max_distance:
                        remove_list.append(last)
                        token = (last[0], token[1])
                        tokens[i] = token
                        if verbose:
                            print("\t\tMerged into", token)
            last = token
        if remove_list:
            for token in remove_list:
                tokens.remove(token)
        if verbose:
            print("final tokenization", tokens)
    return tokens


def merge_tokens(arb_id: ArbID, max_distance):
    # if arb_id.id == 292:  # make this equal to the decimal value of an Arb ID in the data you want to see get merged
    #     verbose = True
    # else:
    verbose = False
    if verbose:
        print("\nENTERING MERGE PHASE OF ARB ID", arb_id.id)
        print("STARTING TOKENS:", arb_id.tokenization)
    arb_id.tokenization = merge_tokens_just_composition(arb_id.tokenization, arb_id.tang, max_distance)


# noinspection PyTypeChecker
def generate_signals(a_timer: PipelineTimer,
                     arb_id_dict: dict,
                     signal_pickle_filename: str,
                     normalize_strategy,
                     force=False):
    if force and path.isfile(signal_pickle_filename):
        remove(signal_pickle_filename)
    if path.isfile(signal_pickle_filename):
        print("\nSignal generation already completed and forcing is turned off. Using pickled data...")
        return load(open(signal_pickle_filename, "rb"))

    a_timer.start_function_time()

    signal_dict = {}

    for k, arb_id in arb_id_dict.items():
        if not arb_id.static:
            for token in arb_id.tokenization:
                a_timer.start_iteration_time()

                signal = Signal(k, token[0], token[1])

                # Convert the binary ndarray to a list of string representations of each row.
                temp1 = [''.join(str(x) for x in row) for row in arb_id.boolean_matrix[:, token[0]:token[1] + 1]]
                temp2 = zeros((len(temp1), 1), dtype=uint64)
                # Convert each string representation to an integer.
                for i, row in enumerate(temp1):
                    temp2[i] = int(row, 2)
                # Create a pandas.Series from the integer values, using the time index from this Arb ID's
                # original data.
                signal.time_series = Series(temp2[:, 0], index=arb_id.original_data.index, dtype=float64)
                # Normalize the signal and update its metadata.
                signal.normalize_and_set_metadata(normalize_strategy)
                # Add this signal to the signal dictionary, which is keyed by Arbitration ID.
                if k in signal_dict:
                    signal_dict[k][(arb_id.id, signal.start_index, signal.stop_index)] = signal
                else:
                    signal_dict[k] = {(arb_id.id, signal.start_index, signal.stop_index): signal}

                a_timer.set_token_to_signal()

    a_timer.set_signal_generation()

    return signal_dict
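# Illustrative merge sketch (not part of the original pipeline; the TANG values below are hypothetical).
# Two adjacent tokens are combined by merge_tokens_just_composition when the transition frequencies at
# their boundary bit positions differ by no more than max_distance:
#
#   from numpy import array
#   example_tang = array([0.30, 0.32, 0.35, 0.37])
#   merged = merge_tokens_just_composition([(0, 1), (2, 3)], example_tang, max_distance=0.1)
#   # abs(example_tang[1] - example_tang[2]) is ~0.03 <= 0.1, so the tokens merge:
#   # merged -> [(0, 3)]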