diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..5c98b42
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,2 @@
+# Default ignored files
+/workspace.xml
\ No newline at end of file
diff --git a/Pipeline/ArbID.py b/Pipeline/ArbID.py
old mode 100644
new mode 100755
diff --git a/Pipeline/J1979.py b/Pipeline/J1979.py
old mode 100644
new mode 100755
diff --git a/Pipeline/LexicalAnalysis.py b/Pipeline/LexicalAnalysis.py
old mode 100644
new mode 100755
diff --git a/Pipeline/Main.py b/Pipeline/Main.py
old mode 100644
new mode 100755
index b2f8d4c..14a4c6c
--- a/Pipeline/Main.py
+++ b/Pipeline/Main.py
@@ -9,190 +9,202 @@ from SemanticAnalysis import subset_selection, subset_correlation, greedy_signal
 from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster
 from PipelineTimer import PipelineTimer
 
+i = 0  # sweeps tokenization_bit_distance, in hundredths (see the while loop below)
+j = 0  # sweeps subset_selection_size, in hundredths
 # File names for the on-disc data input and output.
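The while loop below sweeps tokenization_bit_distance (driven by i) and subset_selection_size (driven by j) over a 0.00-0.50 grid in 0.01 steps by juggling the two counters. A cleaner equivalent, sketched here with a hypothetical run_pipeline() wrapper standing in for the loop body, would visit every pair exactly once:

    from itertools import product

    for i, j in product(range(51), repeat=2):
        run_pipeline(tokenization_bit_distance=i / 100,  # 0.00 .. 0.50
                     subset_selection_size=j / 100)      # 0.00 .. 0.50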
 # Input:
-can_data_filename: str = 'drive_runway_afit.log'
-# can_data_filename: str = 'loggerProgram0.log'
+#can_data_filename: str = 'drive_runway_afit.log'
+can_data_filename: str = 'loggerProgram0.log'
 
-# Output:
-output_folder: str = 'output'
-pickle_arb_id_filename: str = 'pickleArbIDs.p'
-pickle_j1979_filename: str = 'pickleJ1979.p'
-pickle_signal_filename: str = 'pickleSignals.p'
-pickle_subset_filename: str = 'pickleSubset.p'
-csv_correlation_filename: str = 'subset_correlation_matrix.csv'
-pickle_j1979_correlation: str = 'pickleJ1979_correlation.p'
-pickle_clusters_filename: str = 'pickleClusters.p'
-pickle_all_signal_filename: str = 'pickleAllSignalsDataFrame.p'
-csv_all_signals_filename: str = 'complete_correlation_matrix.csv'
-pickle_timer_filename: str = 'pickleTimer.p'
+while i < 51:
+    if i == 50 and j < 50:  # TODO: redesign this sweep; as written it skips the (i=0, j=0) pair and runs one extra pass with i=51
+        j += 1
+        i = 0
+    elif i == 50 and j == 50:
+        i = 51
+    else:
+        i += 1
+    # Output:
+    output_folder: str = 'output'
+    pickle_arb_id_filename: str = 'pickleArbIDs.p'
+    pickle_j1979_filename: str = 'pickleJ1979.p'
+    pickle_signal_filename: str = 'pickleSignals.p'
+    pickle_subset_filename: str = 'pickleSubset.p'
+    csv_correlation_filename: str = 'subset_correlation_matrix.csv'
+    pickle_j1979_correlation: str = 'pickleJ1979_correlation.p'
+    pickle_clusters_filename: str = 'pickleClusters.p'
+    pickle_all_signal_filename: str = 'pickleAllSignalsDataFrame.p'
+    csv_all_signals_filename: str = 'complete_correlation_matrix.csv'
+    pickle_timer_filename: str = 'pickleTimer.p'
 
-# Change out the normalization strategies as needed.
-tang_normalize_strategy: Callable = minmax_scale
-signal_normalize_strategy: Callable = minmax_scale
+    # Change out the normalization strategies as needed.
+    tang_normalize_strategy: Callable = minmax_scale
+    signal_normalize_strategy: Callable = minmax_scale
 
-# Turn on or off portions of the pipeline and output methods using these flags.
-force_pre_processing: bool = False
-force_j1979_plotting: bool = False
+    # Turn on or off portions of the pipeline and output methods using these flags.
+    force_pre_processing: bool = False
+    force_j1979_plotting: bool = True
 
-force_lexical_analysis: bool = False
-force_arb_id_plotting: bool = True
+    force_lexical_analysis: bool = True
+    force_arb_id_plotting: bool = True
 
-force_semantic_analysis: bool = False
-force_signal_labeling: bool = False
-use_j1979_tags_in_plots: bool = True
-force_cluster_plotting: bool = False
+    force_semantic_analysis: bool = True
+    force_signal_labeling: bool = True
+    use_j1979_tags_in_plots: bool = True
+    force_cluster_plotting: bool = True
 
-dump_to_pickle: bool = True
+    dump_to_pickle: bool = True
 
-# Parameters and threshold used for Arb ID transmission frequency analysis during Pre-processing.
-time_conversion = 1000  # convert seconds to milliseconds
-z_lookup = {.8: 1.28, .9: 1.645, .95: 1.96, .98: 2.33, .99: 2.58}
-freq_analysis_accuracy = z_lookup[0.9]
-freq_synchronous_threshold = 0.1
+    # Parameters and threshold used for Arb ID transmission frequency analysis during Pre-processing.
+    time_conversion = 1000  # convert seconds to milliseconds
+    z_lookup = {.8: 1.28, .9: 1.645, .95: 1.96, .98: 2.33, .99: 2.58}
+    freq_analysis_accuracy = z_lookup[0.9]
+    freq_synchronous_threshold = 0.1
 
-# Threshold parameters used during lexical analysis.
-tokenization_bit_distance: float = 0.2
-tokenize_padding: bool = True
-
-# Threshold parameters used during semantic analysis
-subset_selection_size: float = 0.25
-fuzzy_labeling: bool = True
-min_correlation_threshold: float = 0.85
-
-# A timer class to record timings throughout the pipeline.
-a_timer = PipelineTimer(verbose=True)
-
-# DATA IMPORT AND PRE-PROCESSING #
-pre_processor = PreProcessor(can_data_filename, pickle_arb_id_filename, pickle_j1979_filename)
-id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
-                                                                           tang_normalize_strategy,
-                                                                           time_conversion,
-                                                                           freq_analysis_accuracy,
-                                                                           freq_synchronous_threshold,
-                                                                           force_pre_processing)
-if j1979_dictionary:
-    plot_j1979(a_timer, j1979_dictionary, force_j1979_plotting)
+    # Threshold parameters used during lexical analysis. Default is 0.2.
+    tokenization_bit_distance: float = i / 100
+    tokenize_padding: bool = True
 
-# LEXICAL ANALYSIS #
-print("\n\t\t\t##### BEGINNING LEXICAL ANALYSIS #####")
-tokenize_dictionary(a_timer,
-                    id_dictionary,
-                    force_lexical_analysis,
-                    include_padding=tokenize_padding,
-                    merge=True,
-                    max_distance=tokenization_bit_distance)
-signal_dictionary = generate_signals(a_timer,
-                                     id_dictionary,
-                                     pickle_signal_filename,
-                                     signal_normalize_strategy,
-                                     force_lexical_analysis)
-plot_signals_by_arb_id(a_timer, id_dictionary, signal_dictionary, force_arb_id_plotting)
+    # Threshold parameters used during semantic analysis. Defaults are 0.25 and 0.85.
+    subset_selection_size: float = j / 100
+    fuzzy_labeling: bool = True
+    min_correlation_threshold: float = 0.85
 
-# SEMANTIC ANALYSIS #
-print("\n\t\t\t##### BEGINNING SEMANTIC ANALYSIS #####")
-subset_df = subset_selection(a_timer,
-                             signal_dictionary,
-                             pickle_subset_filename,
-                             force_semantic_analysis,
-                             subset_size=subset_selection_size)
-corr_matrix_subset = subset_correlation(subset_df, csv_correlation_filename, force_semantic_analysis)
-cluster_dict = greedy_signal_clustering(corr_matrix_subset,
-                                        correlation_threshold=min_correlation_threshold,
-                                        fuzzy_labeling=fuzzy_labeling)
-df_full, corr_matrix_full, cluster_dict = label_propagation(a_timer,
-                                                            pickle_clusters_filename=pickle_clusters_filename,
-                                                            pickle_all_signals_df_filename=pickle_all_signal_filename,
-                                                            csv_signals_correlation_filename=csv_all_signals_filename,
-                                                            signal_dict=signal_dictionary,
-                                                            cluster_dict=cluster_dict,
-                                                            correlation_threshold=min_correlation_threshold,
-                                                            force=force_semantic_analysis)
-signal_dictionary, j1979_correlations = j1979_signal_labeling(a_timer=a_timer,
-                                                              j1979_corr_filename=pickle_j1979_correlation,
-                                                              df_signals=df_full,
-                                                              j1979_dict=j1979_dictionary,
-                                                              signal_dict=signal_dictionary,
-                                                              correlation_threshold=min_correlation_threshold,
-                                                              force=force_signal_labeling)
-plot_signals_by_cluster(a_timer, cluster_dict, signal_dictionary, use_j1979_tags_in_plots, force_cluster_plotting)
 
-# DATA STORAGE #
-if dump_to_pickle:
-    if force_pre_processing:
-        if path.isfile(pickle_arb_id_filename):
-            remove(pickle_arb_id_filename)
-        if path.isfile(pickle_j1979_filename):
-            remove(pickle_j1979_filename)
-    if force_lexical_analysis or force_signal_labeling:
-        if path.isfile(pickle_signal_filename):
-            remove(pickle_signal_filename)
-    if force_semantic_analysis:
-        if path.isfile(pickle_subset_filename):
-            remove(pickle_subset_filename)
-        if path.isfile(csv_correlation_filename):
-            remove(csv_correlation_filename)
-        if path.isfile(pickle_j1979_correlation):
-            remove(pickle_j1979_correlation)
-        if path.isfile(pickle_clusters_filename):
-            remove(pickle_clusters_filename)
-        if path.isfile(pickle_all_signal_filename):
-            remove(pickle_all_signal_filename)
-        if path.isfile(csv_all_signals_filename):
-            remove(csv_all_signals_filename)
+    # A timer class to record timings throughout the pipeline.
+    a_timer = PipelineTimer(verbose=True)
 
-    timer_flag = 0
-    if not path.exists(output_folder):
-        mkdir(output_folder)
-    chdir(output_folder)
-    if not path.isfile(pickle_arb_id_filename):
-        timer_flag += 1
-        print("\nDumping arb ID dictionary to " + pickle_arb_id_filename)
-        dump(id_dictionary, open(pickle_arb_id_filename, "wb"))
-        print("\tComplete...")
-    if not path.isfile(pickle_j1979_filename):
-        timer_flag += 1
-        print("\nDumping J1979 dictionary to " + pickle_j1979_filename)
-        dump(j1979_dictionary, open(pickle_j1979_filename, "wb"))
-        print("\tComplete...")
-    if not path.isfile(pickle_signal_filename):
-        timer_flag += 1
-        print("\nDumping signal dictionary to " + pickle_signal_filename)
-        dump(signal_dictionary, open(pickle_signal_filename, "wb"))
-        print("\tComplete...")
-    if not path.isfile(pickle_subset_filename):
-        timer_flag += 1
-        print("\nDumping signal subset list to " + pickle_subset_filename)
-        dump(subset_df, open(pickle_subset_filename, "wb"))
-        print("\tComplete...")
-    if not path.isfile(csv_correlation_filename):
-        timer_flag += 1
-        print("\nDumping subset correlation matrix to " + csv_correlation_filename)
-        corr_matrix_subset.to_csv(csv_correlation_filename)
-        print("\tComplete...")
-    if not path.isfile(pickle_j1979_correlation):
-        timer_flag += 1
-        print("\nDumping J1979 correlation DataFrame to " + pickle_j1979_correlation)
-        dump(j1979_correlations, open(pickle_j1979_correlation, "wb"))
-        print("\tComplete...")
-    if not path.isfile(pickle_clusters_filename):
-        timer_flag += 1
-        print("\nDumping cluster dictionary to " + pickle_clusters_filename)
-        dump(cluster_dict, open(pickle_clusters_filename, "wb"))
-        print("\tComplete...")
-    if not path.isfile(pickle_all_signal_filename):
-        timer_flag += 1
-        print("\nDumping complete signals DataFrame to " + pickle_all_signal_filename)
-        dump(df_full, open(pickle_all_signal_filename, "wb"))
-        print("\tComplete...")
-    if not path.isfile(csv_all_signals_filename):
-        timer_flag += 1
-        print("\nDumping complete correlation matrix to " + csv_all_signals_filename)
-        corr_matrix_full.to_csv(csv_all_signals_filename)
-        print("\tComplete...")
-    if timer_flag is 9:
-        print("\nDumping pipeline timer to " + pickle_timer_filename)
-        dump(a_timer, open(pickle_timer_filename, "wb"))
-        print("\tComplete...")
-    chdir("..")
+    # DATA IMPORT AND PRE-PROCESSING #
+    pre_processor = PreProcessor(can_data_filename, pickle_arb_id_filename, pickle_j1979_filename)
+    id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
+                                                                               tang_normalize_strategy,
+                                                                               time_conversion,
+                                                                               freq_analysis_accuracy,
+                                                                               freq_synchronous_threshold,
+                                                                               force_pre_processing)
+    if j1979_dictionary:
+        plot_j1979(a_timer, j1979_dictionary, force_j1979_plotting)
+
+
+    # LEXICAL ANALYSIS #
+    print("\n\t\t\t##### BEGINNING LEXICAL ANALYSIS #####")
+    tokenize_dictionary(a_timer,
+                        id_dictionary,
+                        force_lexical_analysis,
+                        include_padding=tokenize_padding,
+                        merge=True,
+                        max_distance=tokenization_bit_distance)
+    signal_dictionary = generate_signals(a_timer,
+                                         id_dictionary,
+                                         pickle_signal_filename,
+                                         signal_normalize_strategy,
+                                         force_lexical_analysis)
+    plot_signals_by_arb_id(a_timer, id_dictionary, signal_dictionary, i, force_arb_id_plotting)
+
+    # SEMANTIC ANALYSIS #
+    print("\n\t\t\t##### BEGINNING SEMANTIC ANALYSIS #####")
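+    # Semantic analysis proceeds in stages: subset_selection samples a fraction of the signals,
+    # subset_correlation builds their correlation matrix, greedy_signal_clustering groups signals
+    # correlated above min_correlation_threshold, label_propagation extends those labels to the
+    # full signal set, and j1979_signal_labeling tags signals that track J1979 PIDs.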
+    subset_df = subset_selection(a_timer,
+                                 signal_dictionary,
+                                 pickle_subset_filename,
+                                 force_semantic_analysis,
+                                 subset_size=subset_selection_size)
+    corr_matrix_subset = subset_correlation(subset_df, csv_correlation_filename, force_semantic_analysis)
+    cluster_dict = greedy_signal_clustering(corr_matrix_subset,
+                                            correlation_threshold=min_correlation_threshold,
+                                            fuzzy_labeling=fuzzy_labeling)
+    df_full, corr_matrix_full, cluster_dict = label_propagation(a_timer,
+                                                                pickle_clusters_filename=pickle_clusters_filename,
+                                                                pickle_all_signals_df_filename=pickle_all_signal_filename,
+                                                                csv_signals_correlation_filename=csv_all_signals_filename,
+                                                                signal_dict=signal_dictionary,
+                                                                cluster_dict=cluster_dict,
+                                                                correlation_threshold=min_correlation_threshold,
+                                                                force=force_semantic_analysis)
+    signal_dictionary, j1979_correlations = j1979_signal_labeling(a_timer=a_timer,
+                                                                  j1979_corr_filename=pickle_j1979_correlation,
+                                                                  df_signals=df_full,
+                                                                  j1979_dict=j1979_dictionary,
+                                                                  signal_dict=signal_dictionary,
+                                                                  correlation_threshold=min_correlation_threshold,
+                                                                  force=force_signal_labeling)
+    plot_signals_by_cluster(a_timer, cluster_dict, signal_dictionary, use_j1979_tags_in_plots, i, force_cluster_plotting)
+
+    # DATA STORAGE #
+    if dump_to_pickle:
+        if force_pre_processing:
+            if path.isfile(pickle_arb_id_filename):
+                remove(pickle_arb_id_filename)
+            if path.isfile(pickle_j1979_filename):
+                remove(pickle_j1979_filename)
+        if force_lexical_analysis or force_signal_labeling:
+            if path.isfile(pickle_signal_filename):
+                remove(pickle_signal_filename)
+        if force_semantic_analysis:
+            if path.isfile(pickle_subset_filename):
+                remove(pickle_subset_filename)
+            if path.isfile(csv_correlation_filename):
+                remove(csv_correlation_filename)
+            if path.isfile(pickle_j1979_correlation):
+                remove(pickle_j1979_correlation)
+            if path.isfile(pickle_clusters_filename):
+                remove(pickle_clusters_filename)
+            if path.isfile(pickle_all_signal_filename):
+                remove(pickle_all_signal_filename)
+            if path.isfile(csv_all_signals_filename):
+                remove(csv_all_signals_filename)
+
+        timer_flag = 0
+        if not path.exists(output_folder):
+            mkdir(output_folder)
+        chdir(output_folder)
+        if not path.isfile(pickle_arb_id_filename):
+            timer_flag += 1
+            print("\nDumping arb ID dictionary to " + pickle_arb_id_filename)
+            dump(id_dictionary, open(pickle_arb_id_filename, "wb"))
+            print("\tComplete...")
+        if not path.isfile(pickle_j1979_filename):
+            timer_flag += 1
+            print("\nDumping J1979 dictionary to " + pickle_j1979_filename)
+            dump(j1979_dictionary, open(pickle_j1979_filename, "wb"))
+            print("\tComplete...")
+        if not path.isfile(pickle_signal_filename):
+            timer_flag += 1
+            print("\nDumping signal dictionary to " + pickle_signal_filename)
+            dump(signal_dictionary, open(pickle_signal_filename, "wb"))
+            print("\tComplete...")
+        if not path.isfile(pickle_subset_filename):
+            timer_flag += 1
+            print("\nDumping signal subset list to " + pickle_subset_filename)
+            dump(subset_df, open(pickle_subset_filename, "wb"))
+            print("\tComplete...")
+        if not path.isfile(csv_correlation_filename):
+            timer_flag += 1
+            print("\nDumping subset correlation matrix to " + csv_correlation_filename)
+            corr_matrix_subset.to_csv(csv_correlation_filename)
+            print("\tComplete...")
+        if not path.isfile(pickle_j1979_correlation):
+            timer_flag += 1
+            print("\nDumping J1979 correlation DataFrame to " + pickle_j1979_correlation)
+            dump(j1979_correlations, open(pickle_j1979_correlation, "wb"))
+            print("\tComplete...")
+        if not path.isfile(pickle_clusters_filename):
+            timer_flag += 1
+            print("\nDumping cluster dictionary to " + pickle_clusters_filename)
+            dump(cluster_dict, open(pickle_clusters_filename, "wb"))
+            print("\tComplete...")
+        if not path.isfile(pickle_all_signal_filename):
+            timer_flag += 1
+            print("\nDumping complete signals DataFrame to " + pickle_all_signal_filename)
+            dump(df_full, open(pickle_all_signal_filename, "wb"))
+            print("\tComplete...")
+        if not path.isfile(csv_all_signals_filename):
+            timer_flag += 1
+            print("\nDumping complete correlation matrix to " + csv_all_signals_filename)
+            corr_matrix_full.to_csv(csv_all_signals_filename)
+            print("\tComplete...")
+        if timer_flag == 9:  # '==', not 'is': identity comparison on integers is unreliable
+            print("\nDumping pipeline timer to " + pickle_timer_filename)
+            dump(a_timer, open(pickle_timer_filename, "wb"))
+            print("\tComplete...")
+        chdir("..")
diff --git a/Pipeline/PipelineTimer.py b/Pipeline/PipelineTimer.py
old mode 100644
new mode 100755
diff --git a/Pipeline/Plotter.py b/Pipeline/Plotter.py
old mode 100644
new mode 100755
index ff2c919..2ca4a98
--- a/Pipeline/Plotter.py
+++ b/Pipeline/Plotter.py
@@ -16,7 +16,10 @@
 cluster_folder: str = 'clusters'
 j1979_folder: str = 'j1979'
 
-def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, force: bool=False):
+def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, settings: int, force: bool = False):
+    arb_id_folder = 'figures' + str(settings)  # one output folder per sweep setting so iterations don't overwrite each other
+
+
     if path.exists(arb_id_folder):
         if force:
             rmtree(arb_id_folder)
@@ -29,7 +32,7 @@ def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dic
     for k_id, signals in signal_dict.items():
         arb_id = arb_id_dict[k_id]
         if not arb_id.static:
-            print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ")")
+            print(str(settings) + ": Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ")")
             a_timer.start_iteration_time()
 
             signals_to_plot = []
@@ -99,7 +102,9 @@ def plot_signals_by_cluster(a_timer: PipelineTimer,
                             cluster_dict: dict,
                             signal_dict: dict,
                             use_j1979_tags: bool,
+                            settings: int,
                             force: bool=False):
+    cluster_folder = 'cluster' + str(settings)
     if path.exists(cluster_folder):
         if force:
             rmtree(cluster_folder)
diff --git a/Pipeline/PreProcessor.py b/Pipeline/PreProcessor.py
old mode 100644
new mode 100755
index 044e62e..04aa420
--- a/Pipeline/PreProcessor.py
+++ b/Pipeline/PreProcessor.py
@@ -44,7 +44,7 @@ class PreProcessor:
                                  header=None,
                                  names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
                                  skiprows=7,
-                                 delimiter='\t',
+                                 delimiter=' ',  # the new logs are space-delimited rather than tab-delimited
                                  converters=convert_dict,
                                  index_col=0)
 
diff --git a/Pipeline/SemanticAnalysis.py b/Pipeline/SemanticAnalysis.py
old mode 100644
new mode 100755
diff --git a/Pipeline/Signal.py b/Pipeline/Signal.py
old mode 100644
new mode 100755
diff --git a/Pipeline_multi-file/ArbID.py b/Pipeline_multi-file/ArbID.py
old mode 100644
new mode 100755
diff --git a/Pipeline_multi-file/FileBoi.py b/Pipeline_multi-file/FileBoi.py
old mode 100644
new mode 100755
index 3a92498..70f0a44
--- a/Pipeline_multi-file/FileBoi.py
+++ b/Pipeline_multi-file/FileBoi.py
@@ -60,13 +60,14 @@ class FileBoi:
             # Check if this file name matches the expected name for a CAN data sample. If so, create new Sample
             m = re.match('loggerProgram[\d]+.log', file)
             if m:
+                i = 0
                 if not (make, model, year) in sample_dict:
                     sample_dict[(make, model, year)] = []
                 this_sample_index = str(len(sample_dict[(make, model, year)]))
                 this_sample = Sample(make=make, model=model, year=year, sample_index=this_sample_index,
-                                     sample_path=dirName + "/" + m.group(0), kfold_n=kfold_n)
+                                     sample_path=dirName + "/" + m.group(0), kfold_n=kfold_n)
                 sample_dict[(make, model, year)].append(this_sample)
-                current_vehicle = []
+                current_vehicle = []
             else:
                 if this_dir == "Captures":
                     continue
diff --git a/Pipeline_multi-file/J1979.py b/Pipeline_multi-file/J1979.py
old mode 100644
new mode 100755
diff --git a/Pipeline_multi-file/KnownSignalAnalysis.py b/Pipeline_multi-file/KnownSignalAnalysis.py
new file mode 100755
index 0000000..7f09af1
--- /dev/null
+++ b/Pipeline_multi-file/KnownSignalAnalysis.py
@@ -0,0 +1,130 @@
+from numpy import float64, nditer, uint64, zeros, ndarray, inf
+from pandas import Series, DataFrame
+from os import path, remove
+from pickle import load
+from ArbID import ArbID
+from Signal import Signal
+from PipelineTimer import PipelineTimer
+from typing import List
+from scipy import integrate
+
+
+def transform_signal(a_timer: PipelineTimer,
+                     arb_id_dict: dict,
+                     signal_dict: dict,
+                     transform_pickle_filename: str,
+                     normalize_strategy,
+                     given_arb_id: int,
+                     force=False):
+    if force and path.isfile(transform_pickle_filename):
+        remove(transform_pickle_filename)
+    if path.isfile(transform_pickle_filename):
+        print("\nSignal transformation already completed and forcing is turned off. Using pickled data...")
+        return load(open(transform_pickle_filename, "rb"))
+
+    a_timer.start_function_time()
+
+    transform_dict = signal_dict  # NOTE: aliases signal_dict, so new entries also mutate the caller's dictionary
+
+    # arb_id_dict[given_arb_id * 256] = ArbID(given_arb_id * 256)
+
+    for k, arb_id in arb_id_dict.items():
+        # print(str(arb_id.id) + " == " + str(given_arb_id) + " ?\n")
+        if arb_id.id == given_arb_id:
+            arb_id.static = False
+            arb_id.short = False
+        if not arb_id.static:
+            for token in arb_id.tokenization:
+                a_timer.start_iteration_time()
+
+                signal = Signal(k * 256, token[0], token[1])
+                signal.static = False
+
+                # Convert the binary ndarray to a list of string representations of each row
+                temp1 = [''.join(str(x) for x in row) for row in arb_id.boolean_matrix[:, token[0]:token[1] + 1]]
+                temp2 = zeros(len(temp1) + 1, dtype=uint64)  # extra slot so cumtrapz's n-1 output matches the n-row time index
+                # convert each string representation to int
+                for i, row in enumerate(temp1):
+                    temp2[i] = int(row, 2)
+
+                temp3 = integrate.cumtrapz(temp2)
+                print("Arb Id " + str(k) + ", Signal from " + str(token[0]) + " to " + str(token[1]) +
+                      " integrated successfully")
+
+                # Create a floating-point pandas.Series using the time index from this Arb ID's original data.
+                signal.time_series = Series(temp3[:], index=arb_id.original_data.index, dtype=float64)
+
+                # Normalize the signal and update its meta-data
+                signal.normalize_and_set_metadata(normalize_strategy)
+                # add this signal to the signal dictionary which is keyed by Arbitration ID
+                if (k * 256) in transform_dict:
+                    transform_dict[k * 256][(arb_id.id * 256, signal.start_index, signal.stop_index)] = signal
+                else:
+                    print("Successfully added at transform dict")
+                    transform_dict[k * 256] = {(arb_id.id * 256, signal.start_index, signal.stop_index): signal}
+
+                a_timer.set_token_to_signal()
+
+    a_timer.set_signal_generation()
+
+    return transform_dict
+
+
+def transform_signals(a_timer: PipelineTimer,
+                      arb_id_dict: dict,
+                      transform_pickle_filename: str,
+                      normalize_strategy,
+                      force=False):
+    if force and path.isfile(transform_pickle_filename):
+        remove(transform_pickle_filename)
+    if path.isfile(transform_pickle_filename):
+        print("\nSignal transformation already completed and forcing is turned off. Using pickled data...")
+        return load(open(transform_pickle_filename, "rb"))
+
+    a_timer.start_function_time()
+
+    transform_dict = {}  # arb_id_dict
+
+    for k, arb_id in arb_id_dict.items():
+        if not arb_id.static:
+            for token in arb_id.tokenization:
+                a_timer.start_iteration_time()
+
+                signal = Signal(k * 256, token[0], token[1])
+
+                # Convert the binary ndarray to a list of string representations of each row
+                temp1 = [''.join(str(x) for x in row) for row in arb_id.boolean_matrix[:, token[0]:token[1] + 1]]
+                temp2 = zeros(len(temp1) + 1, dtype=uint64)  # extra slot so cumtrapz's n-1 output matches the n-row time index
+                # convert each string representation to int
+                for i, row in enumerate(temp1):
+                    temp2[i] = int(row, 2)
+
+                temp3 = integrate.cumtrapz(temp2)
+
+                # Create a floating-point pandas.Series using the time index from this Arb ID's original data.
+                signal.time_series = Series(temp3[:], index=arb_id.original_data.index, dtype=float64)
+
+                # Normalize the signal and update its meta-data
+                signal.normalize_and_set_metadata(normalize_strategy)
+                # add this signal to the signal dictionary which is keyed by Arbitration ID
+                if k in transform_dict:
+                    transform_dict[k][(arb_id.id, signal.start_index, signal.stop_index)] = signal
+                else:
+                    transform_dict[k] = {(arb_id.id, signal.start_index, signal.stop_index): signal}
+
+                a_timer.set_token_to_signal()
+
+    a_timer.set_signal_generation()
+
+    return transform_dict
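Both functions above apply the same transform: each token's bit slice is packed into an unsigned integer per frame, then run through scipy's cumulative trapezoidal rule, apparently so a rate-like signal (the known speed ID, for instance) becomes a smoothly accumulating series before correlation. A standalone toy run of the temp1/temp2/temp3 steps (illustrative values only, not pipeline code):

    from numpy import uint64, zeros
    from scipy import integrate

    rows = [[0, 1], [1, 0], [1, 1]]                      # three frames of a 2-bit token
    temp1 = [''.join(str(x) for x in r) for r in rows]   # ['01', '10', '11']
    temp2 = zeros(len(temp1) + 1, dtype=uint64)          # trailing zero keeps lengths aligned
    for i, r in enumerate(temp1):
        temp2[i] = int(r, 2)                             # [1, 2, 3, 0]
    temp3 = integrate.cumtrapz(temp2)                    # [1.5, 4.0, 5.5], one value per frame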
diff --git a/Pipeline_multi-file/LexicalAnalysis.py b/Pipeline_multi-file/LexicalAnalysis.py
old mode 100644
new mode 100755
diff --git a/Pipeline_multi-file/Main.py b/Pipeline_multi-file/Main.py
old mode 100644
new mode 100755
index 0f1d030..aeeb00a
--- a/Pipeline_multi-file/Main.py
+++ b/Pipeline_multi-file/Main.py
@@ -5,14 +5,22 @@
 # Cross validation parameters for finding an optimal tokenization inversion distance threshold -- NOT WORKING?
 kfold_n: int = 5
 current_vehicle_number = 0
+known_speed_arb_id = 514
 
 good_boi = FileBoi()
 samples = good_boi.go_fetch(kfold_n)
 for key, sample_list in samples.items():  # type: tuple, list
     for sample in sample_list:  # type: Sample
-        print(current_vehicle_number)
+
+        # sample.tang_inversion_bit_dist += (0.01 * current_vehicle_number)
+        # sample.max_inter_cluster_dist += (0.01 * current_vehicle_number)
+        # sample.tang_inversion_bit_dist = round(sample.tang_inversion_bit_dist, 2)  # removes floating point errors
+        # sample.max_inter_cluster_dist = round(sample.max_inter_cluster_dist, 2)
+        # print("\n\t##### Settings are " + str(sample.tang_inversion_bit_dist) + " and " + str(
+        #     sample.max_inter_cluster_dist) + " #####")
+
+        print("\nData import and Pre-Processing for " + sample.output_vehicle_dir)
-        id_dict, j1979_dict = sample.pre_process()
+        id_dict, j1979_dict = sample.pre_process(known_speed_arb_id)
         if j1979_dict:
             sample.plot_j1979(j1979_dict, vehicle_number=str(current_vehicle_number))
@@ -25,14 +33,22 @@
         print("\n\t##### BEGINNING LEXICAL ANALYSIS OF " + sample.output_vehicle_dir + " #####")
         sample.tokenize_dictionary(id_dict)
         signal_dict = sample.generate_signals(id_dict, bool(j1979_dict))
-        sample.plot_arb_ids(id_dict, signal_dict, vehicle_number=str(current_vehicle_number))
+        # sample.plot_arb_ids(id_dict, signal_dict, vehicle_number=str(current_vehicle_number))
 
-        # LEXICAL ANALYSIS #
+        # KNOWN SIGNAL ANALYSIS #
+        print("\n\t##### BEGINNING KNOWN SIGNAL ANALYSIS OF " + sample.output_vehicle_dir + " #####")
+        transform_dict = sample.transform_signal(id_dict, signal_dict, known_speed_arb_id)
+        sample.plot_arb_ids(id_dict, transform_dict, vehicle_number=str(current_vehicle_number))
+
+
+        # SEMANTIC ANALYSIS #
         print("\n\t##### BEGINNING SEMANTIC ANALYSIS OF " + sample.output_vehicle_dir + " #####")
-        corr_matrix, combined_df = sample.generate_correlation_matrix(signal_dict)
+        corr_matrix, combined_df = sample.generate_correlation_matrix(transform_dict)
         if j1979_dict:
-            signal_dict, j1979_correlation = sample.j1979_labeling(j1979_dict, signal_dict, combined_df)
+            transform_dict, j1979_correlation = sample.j1979_labeling(j1979_dict, transform_dict, combined_df)
         cluster_dict, linkage_matrix = sample.cluster_signals(corr_matrix)
-        sample.plot_clusters(cluster_dict, signal_dict, bool(j1979_dict), vehicle_number=str(current_vehicle_number))
+        # sample.plot_clusters(cluster_dict, signal_dict, bool(j1979_dict), vehicle_number=str(current_vehicle_number))
+        sample.plot_known_signal_cluster(cluster_dict, signal_dict, bool(j1979_dict), known_speed_arb_id, vehicle_number=str(current_vehicle_number))
         sample.plot_dendrogram(linkage_matrix, vehicle_number=str(current_vehicle_number))
         current_vehicle_number += 1
+
diff --git a/Pipeline_multi-file/PipelineTimer.py b/Pipeline_multi-file/PipelineTimer.py
old mode 100644
new mode 100755
diff --git a/Pipeline_multi-file/Plotter.py b/Pipeline_multi-file/Plotter.py
old mode 100644
new mode 100755
index 186e6ff..c9610ca
--- a/Pipeline_multi-file/Plotter.py
+++ b/Pipeline_multi-file/Plotter.py
@@ -25,13 +25,13 @@ def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dic
             rmtree(arb_id_folder)
         else:
            print("\nArbID plotting appears to have already been done and forcing is turned off. Skipping...")
-            return
+            # return
 
     a_timer.start_function_time()
 
     for k_id, signals in signal_dict.items():
         arb_id = arb_id_dict[k_id]
-        if not arb_id.static and not arb_id.short:
+        if (not arb_id.static and not arb_id.short) or k_id == 155136:  # 155136 == 606 * 256, a known Arb ID duplicated by the * 256 convention
             print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ") for Vehicle " + vehicle_number)
             a_timer.start_iteration_time()
 
@@ -85,7 +85,7 @@ def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dic
             chdir(arb_id_folder)
 
             # If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
-            savefig(hex(arb_id.id) + "." + figure_format,
+            savefig(hex(signal.arb_id) + "." + figure_format,
                     bbox_iches='tight',
                     pad_inches=0.0,
                     dpi=figure_dpi,
@@ -311,3 +311,162 @@ def plot_dendrogram(a_timer: PipelineTimer,
                     transparent=figure_transp)
     plt.close()
     print("\t\tComplete...")
+
+
+def plot_known_signal_cluster(a_timer: PipelineTimer,
+                              cluster_dict: dict,
+                              signal_dict: dict,
+                              use_j1979_tags: bool,
+                              vehicle_number: str,
+                              given_arb_id: int,
+                              force: bool = False):
+    if path.exists(cluster_folder):
+        if force:
+            rmtree(cluster_folder)
+        else:
+            print("\nCluster plotting appears to have already been done and forcing is turned off. Skipping...")
+            return
+
+    a_timer.start_function_time()
+
+    print("\n")
+    for cluster_number, list_of_signals in cluster_dict.items():
+        if [v for i, v in enumerate(list_of_signals) if (v[0] == given_arb_id or v[0] == given_arb_id * 256)]:
+            print("Plotting cluster", cluster_number, "with " + str(len(list_of_signals)) + " signals.")
+            a_timer.start_iteration_time()
+
+            # Setup the plot
+            fig, axes = plt.subplots(nrows=len(list_of_signals), ncols=1, squeeze=False)
+            plt.suptitle("Signal Cluster " + str(cluster_number) + " from Vehicle " + vehicle_number,
+                         weight='bold',
+                         position=(0.5, 1))
+            fig.set_size_inches(8, (1 + len(list_of_signals) + 1) * 1.3)
+
+            size_adjust = len(list_of_signals) / 100
+            # The min() statement provides whitespace for the suptitle depending on the number of subplots.
+            plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
+            # This adjusts whitespace padding on the left and right of the subplots
+            fig.subplots_adjust(left=0.07, right=0.98)
+
+            # Plot the time series of each signal in the cluster
+            for i, signal_key in enumerate(list_of_signals):
+                signal = signal_dict[signal_key[0]][signal_key]
+                ax = axes[i, 0]
+                if signal.j1979_title and use_j1979_tags:
+                    this_title = signal.plot_title + " [" + signal.j1979_title + \
+                                 " (PCC:" + str(round(signal.j1979_pcc, 2)) + ")]"
+                else:
+                    this_title = signal.plot_title
+                ax.set_title(this_title,
+                             style='italic',
+                             size='medium')
+                ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
+                ax.plot(signal.time_series, color='black')
+
+            if not path.exists(cluster_folder):
+                mkdir(cluster_folder)
+            chdir(cluster_folder)
+
+            # If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
+            if len(list_of_signals) < 100:  # prevents errors when the correlation threshold is set too low
+                savefig("cluster_" + str(cluster_number) + "." + figure_format,
+                        bbox_inches='tight',
+                        pad_inches=0.0,
+                        dpi=figure_dpi,
+                        format=figure_format,
+                        transparent=figure_transp)
+            else:
+                print("Too many signals in this cluster to plot! Skipping...")
+
+            chdir("..")
+
+            plt.close(fig)
+
+            a_timer.set_plot_save_cluster()
+            print("\tComplete...")
+
+    a_timer.set_plot_save_cluster_dict()
+
+
+# NOTE: this re-definition shadows the plot_signals_by_arb_id defined earlier in this module.
+def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, vehicle_number: str,
+                           force: bool = False):
+    if path.exists(arb_id_folder):
+        if force:
+            rmtree(arb_id_folder)
+        else:
+            print("\nArbID plotting appears to have already been done and forcing is turned off. Skipping...")
+            # return
+
+    a_timer.start_function_time()
+
+    for k_id, signals in signal_dict.items():
+        arb_id = arb_id_dict[k_id]
+        if (not arb_id.static and not arb_id.short) or k_id == 155136:
+            print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ") for Vehicle " + vehicle_number)
+            a_timer.start_iteration_time()
+
+            signals_to_plot = []
+            # Don't plot the static signals
+            for k_signal, signal in signals.items():
+                if not signal.static:
+                    signals_to_plot.append(signal)
+            # There's a corner case where the Arb ID only has static signals. This conditional accounts for this.
+            # TODO: This corner case should probably be reflected by arb_id.static.
+            if len(signals_to_plot) < 1:
+                continue
+            # One row per signal plus one for the TANG. Squeeze is used to force axes to be an array to avoid errors.
+            fig, axes = plt.subplots(nrows=1 + len(signals_to_plot), ncols=1)
+            plt.suptitle("Time Series and TANG for Arbitration ID " + hex(k_id) + " from Vehicle " + vehicle_number,
+                         weight='bold',
+                         position=(0.5, 1))
+            fig.set_size_inches(8, (1 + len(signals_to_plot) + 1) * 1.3)
+            # The min() statement provides whitespace for the title depending on the number of subplots.
+            size_adjust = len(signals_to_plot) / 100
+            plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
+            # This adjusts whitespace padding on the left and right of the subplots
+            fig.subplots_adjust(left=0.07, right=0.98)
+            for i, signal in enumerate(signals_to_plot):
+                ax = axes[i]
+                ax.set_title(signal.plot_title,
+                             style='italic',
+                             size='medium')
+                ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
+                ax.plot(signal.time_series, color='black')
+                # Add a 25% opacity dashed black line to the entropy gradient plot at one boundary of each sub-flow
+                axes[-1].axvline(x=signal.start_index, alpha=0.25, c='black', linestyle='dashed')
+
+            # Plot the entropy gradient at the bottom of the overall output
+            ax = axes[-1]
+            ax.set_title("Min-Max Normalized Transition Aggregation N-Gram (TANG)",
+                         style='italic',
+                         size='medium')
+            tang_bit_width = arb_id.tang.shape[0]
+            ax.set_xlim([-0.01 * tang_bit_width, 1.005 * tang_bit_width])
+            y = arb_id.tang[:]
+            # Differentiate bit positions with non-zero and zero entropy using black points and grey x respectively.
+            ix = isin(y, 0)
+            pad_bit = where(ix)
+            non_pad_bit = where(~ix)
+            ax.scatter(non_pad_bit, y[non_pad_bit], color='black', marker='o', s=10)
+            ax.scatter(pad_bit, y[pad_bit], color='grey', marker='^', s=10)
+
+            if not path.exists(arb_id_folder):
+                mkdir(arb_id_folder)
+            chdir(arb_id_folder)
+
+            # If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
+            savefig(hex(signal.arb_id) + "." + figure_format,
+                    bbox_inches='tight',
+                    pad_inches=0.0,
+                    dpi=figure_dpi,
+                    format=figure_format,
+                    transparent=figure_transp)
+
+            chdir("..")
+
+            plt.close(fig)
+
+            a_timer.set_plot_save_arb_id()
+            print("\tComplete...")
+
+    a_timer.set_plot_save_arb_id_dict()
\ No newline at end of file
diff --git a/Pipeline_multi-file/PreProcessor.py b/Pipeline_multi-file/PreProcessor.py
old mode 100644
new mode 100755
index de1ee20..2819686
--- a/Pipeline_multi-file/PreProcessor.py
+++ b/Pipeline_multi-file/PreProcessor.py
@@ -1,4 +1,4 @@
-from pandas import DataFrame, read_csv, Series
+from pandas import DataFrame, read_csv, Series, concat
 from numpy import int64
 from os import path, remove, getcwd
 from pickle import load
@@ -45,7 +45,7 @@ class PreProcessor:
                                  header=None,
                                  names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
                                  skiprows=7,
-                                 delimiter='\t',
+                                 delimiter=' ',
                                  converters=convert_dict,
                                  index_col=0)
 
@@ -70,6 +70,7 @@ class PreProcessor:
                                    time_conversion: int = 1000,
                                    freq_analysis_accuracy: float = 0.0,
                                    freq_synchronous_threshold: float = 0.0,
+                                   given_arb_id: int = 0,
                                    force: bool = False) -> (dict, dict):
         id_dictionary = {}
         j1979_dictionary = {}
@@ -92,6 +93,11 @@ class PreProcessor:
                 return id_dictionary, j1979_dictionary
         else:
             self.import_csv(a_timer, self.data_filename)
+            this_id = self.data.loc[self.data['id'] == given_arb_id].copy()
+            this_id.id = given_arb_id * 256
+
+            combined = concat([self.data, this_id])
+            self.data = combined
 
         a_timer.start_function_time()
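The PreProcessor change above copies every frame of the known Arb ID and re-keys the copy as given_arb_id * 256 before the Arb ID dictionary is built; that is apparently what gives the integrated signals produced in KnownSignalAnalysis (stored under k * 256) a matching raw-data entry alongside the untouched original. A toy illustration of the idiom (hypothetical values, not pipeline code):

    from pandas import DataFrame, concat

    data = DataFrame({'id': [514, 1200, 514], 'b0': [1, 2, 3]})
    known = data.loc[data['id'] == 514].copy()  # frames of the known Arb ID
    known.id = 514 * 256                        # re-key the copy as 131584
    data = concat([data, known])                # originals plus the shadow copy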
diff --git a/Pipeline_multi-file/Sample.py b/Pipeline_multi-file/Sample.py
old mode 100644
new mode 100755
index 488bf1e..39d9ce9
--- a/Pipeline_multi-file/Sample.py
+++ b/Pipeline_multi-file/Sample.py
@@ -2,7 +2,7 @@ from PreProcessor import PreProcessor
 from Validator import Validator
 from LexicalAnalysis import tokenize_dictionary, generate_signals
 from SemanticAnalysis import generate_correlation_matrix, signal_clustering, j1979_signal_labeling
-from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster, plot_dendrogram
+from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster, plot_dendrogram, plot_known_signal_cluster
 from sklearn.preprocessing import minmax_scale
 from typing import Callable
 from PipelineTimer import PipelineTimer
@@ -11,6 +11,8 @@ from pickle import dump, load
 from numpy import ndarray, zeros, float16
 from pandas import DataFrame
 
+from KnownSignalAnalysis import transform_signals, transform_signal
+
 # File names for the on-disc data input and output.
 output_folder: str = 'output'
 pickle_arb_id_filename: str = 'pickleArbIDs.p'
@@ -26,6 +28,8 @@ pickle_combined_df_filename: str = 'pickleCombinedDataFrame.p'
 csv_all_signals_filename: str = 'complete_correlation_matrix.csv'
 pickle_timer_filename: str = 'pickleTimer.p'
 
+pickle_transform_filename: str = 'pickleTransform'
+
 dump_to_pickle: bool = True
 
 # Change out the normalization strategies as needed.
@@ -39,9 +43,11 @@ force_threshold_plotting: bool = False
 force_j1979_plotting: bool = True
 use_j1979: bool = True
 
+force_transform: bool = False
+
 force_lexical_analysis: bool = False
 force_signal_generation: bool = False
-force_arb_id_plotting: bool = True
+force_arb_id_plotting: bool = False
 
 force_correlation_matrix: bool = False
 force_clustering: bool = False
@@ -58,16 +64,15 @@ freq_synchronous_threshold = 0.1
 # Threshold parameters used during lexical analysis.
 tokenization_bit_distance: float = 0.2
-tokenize_padding: bool = True
+tokenize_padding: bool = False  # setting this to False seems to help find weak signals
 merge_tokens: bool = True
 
 # Threshold parameters used during semantic analysis
 subset_selection_size: float = 0.25
-max_intra_cluster_distance: float = 0.20
+max_intra_cluster_distance: float = 0.10  # default is 0.20
 min_j1979_correlation: float = 0.85
 # fuzzy_labeling: bool = True
-
 # A timer class to record timings throughout the pipeline.
 a_timer = PipelineTimer(verbose=True)
@@ -112,7 +117,7 @@ class Sample:
         # Move back to root of './output/make_model_year/sample_index/"
         chdir("../../../")
 
-    def pre_process(self):
+    def pre_process(self, given_arb_id):
         self.make_and_move_to_vehicle_directory()
         pre_processor = PreProcessor(self.path, pickle_arb_id_filename, pickle_j1979_filename, self.use_j1979)
         id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
@@ -120,6 +125,7 @@ class Sample:
                                                                                    time_conversion,
                                                                                    freq_analysis_accuracy,
                                                                                    freq_synchronous_threshold,
+                                                                                   given_arb_id,
                                                                                    force_pre_processing)
         if dump_to_pickle:
             if force_pre_processing:
@@ -303,3 +309,37 @@ class Sample:
         plot_dendrogram(a_timer=a_timer, linkage_matrix=linkage_matrix, threshold=self.max_inter_cluster_dist,
                         vehicle_number=vehicle_number, force=force_dendrogram_plotting)
         self.move_back_to_parent_directory()
+
+    def transform_signals(self, id_dictionary: dict):
+        self.make_and_move_to_vehicle_directory()
+        transform_dict = transform_signals(a_timer=a_timer,
+                                           arb_id_dict=id_dictionary,
+                                           transform_pickle_filename=pickle_transform_filename,
+                                           normalize_strategy=signal_normalize_strategy,
+                                           force=force_transform)
+        self.move_back_to_parent_directory()
+        return transform_dict
+
+    def transform_signal(self, id_dictionary: dict, signal_dict: dict, arb_id: int):
+        self.make_and_move_to_vehicle_directory()
+        transform_dict = transform_signal(a_timer=a_timer,
+                                          arb_id_dict=id_dictionary,
+                                          signal_dict=signal_dict,
+                                          transform_pickle_filename=pickle_transform_filename,
+                                          normalize_strategy=signal_normalize_strategy,
+                                          given_arb_id=arb_id,
+                                          force=force_transform)
+        self.move_back_to_parent_directory()
+        return transform_dict
+
+    def plot_known_signal_cluster(self, cluster_dictionary: dict, signal_dictionary: dict, use_j1979_tags: bool,
+                                  known_signal: int, vehicle_number: str):
+        self.make_and_move_to_vehicle_directory()
+        plot_known_signal_cluster(a_timer=a_timer,
+                                  cluster_dict=cluster_dictionary,
+                                  signal_dict=signal_dictionary,
+                                  use_j1979_tags=use_j1979_tags,
+                                  vehicle_number=vehicle_number,
+                                  given_arb_id=known_signal,
+                                  force=force_cluster_plotting)
+        self.move_back_to_parent_directory()
\ No newline at end of file
diff --git a/Pipeline_multi-file/SemanticAnalysis.py b/Pipeline_multi-file/SemanticAnalysis.py
old mode 100644
new mode 100755
index b4efc2e..707cc83
--- a/Pipeline_multi-file/SemanticAnalysis.py
+++ b/Pipeline_multi-file/SemanticAnalysis.py
@@ -1,5 +1,5 @@
 from pandas import concat, DataFrame, read_csv
-from numpy import ndarray, zeros
+from numpy import ndarray, zeros, clip
 from os import path, remove
 from pickle import load, dump
 from ast import literal_eval
@@ -77,7 +77,7 @@ def signal_clustering(corr_matrix: DataFrame,
     corr_matrix.where(corr_matrix > 0, 0, inplace=True)
     corr_matrix = 1 - corr_matrix
     X = corr_matrix.values  # type: ndarray
-    Y = ssd.squareform(X)
+    Y = clip(ssd.squareform(X), 0, None)
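+    # clip() floors the condensed distances at zero: 1 - correlation can dip just below 0.0 through
+    # floating-point round-off, and scipy's linkage() rejects negative distances.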
     # Z is the linkage matrix. This can serve as input to the scipy.cluster.hierarchy.dendrogram method
     Z = linkage(Y, method='single', optimal_ordering=True)
     fclus = fcluster(Z, t=threshold, criterion='distance')
diff --git a/Pipeline_multi-file/Signal.py b/Pipeline_multi-file/Signal.py
old mode 100644
new mode 100755
diff --git a/Pipeline_multi-file/Validator.py b/Pipeline_multi-file/Validator.py
old mode 100644
new mode 100755
diff --git a/Pipeline_multi-file/maximize_sum_shannon.py b/Pipeline_multi-file/maximize_sum_shannon.py
old mode 100644
new mode 100755