diff --git a/Pipeline/.gitignore b/Pipeline/.gitignore
new file mode 100644
index 0000000..4c3a695
--- /dev/null
+++ b/Pipeline/.gitignore
@@ -0,0 +1,4 @@
+clusters/
+figures/
+output/
+
diff --git a/Pipeline/FromCanUtilsLog.py b/Pipeline/FromCanUtilsLog.py
new file mode 100644
index 0000000..cda5579
--- /dev/null
+++ b/Pipeline/FromCanUtilsLog.py
@@ -0,0 +1,33 @@
+import re
+
+
+def canUtilsToTSV(filename):
+    outFileName = filename + ".tsv"
+    # Matches candump-style lines: (timestamp) interface arb_id#hex_payload
+    linePattern = re.compile(
+        r"\((\d+\.\d+)\)\s+\S+\s+([0-9A-F]{3})#([0-9A-F]+)")
+
+    with open(outFileName, "w") as outFile, open(filename, "r") as inFile:
+        for line in inFile:
+            match = linePattern.search(line)
+            if not match:
+                # skip blank or malformed lines
+                continue
+            tokens = match.groups()
+
+            # write timestamp
+            writeLine = tokens[0]
+
+            # write arb id
+            writeLine += '\t' + tokens[1]
+
+            # write dlc (two hex characters per data byte)
+            num_bytes = len(tokens[2]) // 2
+            writeLine += '\t' + str(num_bytes)
+
+            # write data bytes
+            for b in range(num_bytes):
+                writeLine += '\t' + tokens[2][b*2:b*2+2]
+
+            outFile.write(writeLine + '\n')
+    return outFileName
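
Note: canUtilsToTSV expects standard candump-style log lines and emits one
tab-separated row per frame: timestamp, arbitration ID, DLC, then one column per
data byte. A minimal sketch of the round trip, using a made-up frame for
illustration:

    from FromCanUtilsLog import canUtilsToTSV

    # A can-utils line such as
    #     (1556218869.533684) can0 166#D0320018
    # becomes the TSV row (tab-separated)
    #     1556218869.533684  166  4  D0  32  00  18
    tsv_path = canUtilsToTSV("drive_runway_afit.log")  # writes drive_runway_afit.log.tsv
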
diff --git a/Pipeline/Main.py b/Pipeline/Main.py
index b2f8d4c..535e916 100644
--- a/Pipeline/Main.py
+++ b/Pipeline/Main.py
@@ -1,3 +1,4 @@
+import argparse
 from os import chdir, mkdir, path, remove
 from pickle import dump
 from sklearn.preprocessing import minmax_scale
@@ -8,11 +9,26 @@ from SemanticAnalysis import subset_selection, subset_correlation, greedy_signal
     j1979_signal_labeling
 from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster
 from PipelineTimer import PipelineTimer
+from FromCanUtilsLog import canUtilsToTSV
 
 # File names for the on-disc data input and output.
 # Input:
-can_data_filename: str = 'drive_runway_afit.log'
-# can_data_filename: str = 'loggerProgram0.log'
+
+# get the filename from the argument parser
+parser = argparse.ArgumentParser()
+parser.add_argument("filename", nargs='*', type=str,
+                    help="filename of CAN log file")
+parser.add_argument(
+    "-c", "--can-utils", help="read file in Linux can-utils format", action="store_true")
+
+args = parser.parse_args()
+
+# default to "loggerProgram0.log" if no filename is specified
+can_data_filename = args.filename[0] if args.filename else "loggerProgram0.log"
+
+if args.can_utils:
+    # run the converter to produce a TSV file before continuing
+    can_data_filename = canUtilsToTSV(can_data_filename)
 
 # Output:
 output_folder: str = 'output'
@@ -64,7 +80,8 @@ min_correlation_threshold: float = 0.85
 a_timer = PipelineTimer(verbose=True)
 
 # DATA IMPORT AND PRE-PROCESSING #
-pre_processor = PreProcessor(can_data_filename, pickle_arb_id_filename, pickle_j1979_filename)
+pre_processor = PreProcessor(
+    can_data_filename, pickle_arb_id_filename, pickle_j1979_filename)
 id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
                                                                            tang_normalize_strategy,
                                                                            time_conversion,
@@ -88,7 +105,8 @@ signal_dictionary = generate_signals(a_timer,
                                      pickle_signal_filename,
                                      signal_normalize_strategy,
                                      force_lexical_analysis)
-plot_signals_by_arb_id(a_timer, id_dictionary, signal_dictionary, force_arb_id_plotting)
+plot_signals_by_arb_id(a_timer, id_dictionary,
+                       signal_dictionary, force_arb_id_plotting)
 
 # SEMANTIC ANALYSIS #
 print("\n\t\t\t##### BEGINNING SEMANTIC ANALYSIS #####")
@@ -97,7 +115,8 @@ subset_df = subset_selection(a_timer,
                             pickle_subset_filename,
                             force_semantic_analysis,
                             subset_size=subset_selection_size)
-corr_matrix_subset = subset_correlation(subset_df, csv_correlation_filename, force_semantic_analysis)
+corr_matrix_subset = subset_correlation(
+    subset_df, csv_correlation_filename, force_semantic_analysis)
 cluster_dict = greedy_signal_clustering(corr_matrix_subset,
                                         correlation_threshold=min_correlation_threshold,
                                         fuzzy_labeling=fuzzy_labeling)
@@ -116,7 +135,8 @@ signal_dictionary, j1979_correlations = j1979_signal_labeling(a_timer=a_timer,
                                                               signal_dict=signal_dictionary,
                                                               correlation_threshold=min_correlation_threshold,
                                                               force=force_signal_labeling)
-plot_signals_by_cluster(a_timer, cluster_dict, signal_dictionary, use_j1979_tags_in_plots, force_cluster_plotting)
+plot_signals_by_cluster(a_timer, cluster_dict, signal_dictionary,
+                        use_j1979_tags_in_plots, force_cluster_plotting)
 
 # DATA STORAGE #
 if dump_to_pickle:
@@ -173,7 +193,8 @@ if dump_to_pickle:
         print("\tComplete...")
     if not path.isfile(pickle_j1979_correlation):
         timer_flag += 1
-        print("\nDumping J1979 correlation DataFrame to " + pickle_j1979_correlation)
+        print("\nDumping J1979 correlation DataFrame to " +
+              pickle_j1979_correlation)
         dump(j1979_correlations, open(pickle_j1979_correlation, "wb"))
         print("\tComplete...")
     if not path.isfile(pickle_clusters_filename):
@@ -183,15 +204,17 @@ if dump_to_pickle:
         print("\tComplete...")
     if not path.isfile(pickle_all_signal_filename):
         timer_flag += 1
-        print("\nDumping complete signals DataFrame to " + pickle_all_signal_filename)
+        print("\nDumping complete signals DataFrame to " +
+              pickle_all_signal_filename)
         dump(df_full, open(pickle_all_signal_filename, "wb"))
         print("\tComplete...")
     if not path.isfile(csv_all_signals_filename):
         timer_flag += 1
-        print("\nDumping complete correlation matrix to " + csv_all_signals_filename)
+        print("\nDumping complete correlation matrix to " +
+              csv_all_signals_filename)
         corr_matrix_full.to_csv(csv_all_signals_filename)
         print("\tComplete...")
-    if timer_flag is 9:
+    if timer_flag == 9:
         print("\nDumping pipeline timer to " + pickle_timer_filename)
         dump(a_timer, open(pickle_timer_filename, "wb"))
         print("\tComplete...")
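
Usage note: with the argparse changes above, Main.py takes the log filename as a
positional argument (vehicle_candump.log below is a hypothetical filename):

    python Main.py drive_runway_afit.log     # pre-tokenized TSV-style log, used as-is
    python Main.py -c vehicle_candump.log    # can-utils log, converted to vehicle_candump.log.tsv first
    python Main.py                           # no argument: falls back to loggerProgram0.log
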
diff --git a/Pipeline/PreProcessor.py b/Pipeline/PreProcessor.py
index 044e62e..8a9b765 100644
--- a/Pipeline/PreProcessor.py
+++ b/Pipeline/PreProcessor.py
@@ -42,12 +42,15 @@ class PreProcessor:
 
         self.data = read_csv(filename,
                              header=None,
-                             names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
+                             names=['time', 'id', 'dlc', 'b0', 'b1',
+                                    'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
                              skiprows=7,
-                             delimiter='\t',
+                             delim_whitespace=True,
                              converters=convert_dict,
                              index_col=0)
 
+        print(self.data)
+
         a_timer.set_can_csv_to_df()
 
         # sanity check output of the original data
@@ -95,11 +98,13 @@
                 continue
             elif arb_id == 2024:
                 # This is the J1979 responses (ID 0x7DF & 0x8 = 0x7E8 = 2024)
-                j1979_data = self.data.loc[self.data['id'] == arb_id].copy()
+                j1979_data = self.data.loc[self.data['id'] == arb_id].copy(
+                )
                 j1979_data.drop('dlc', axis=1, inplace=True)
                 j1979_data.drop('id', axis=1, inplace=True)
                 a_timer.start_nested_function_time()
-                j1979_dictionary = self.generate_j1979_dictionary(j1979_data)
+                j1979_dictionary = self.generate_j1979_dictionary(
+                    j1979_data)
                 a_timer.set_j1979_creation()
             elif arb_id > 0:
                 a_timer.start_iteration_time()
@@ -110,7 +115,7 @@ class PreProcessor:
 
                 # Check if the Arbitration ID always used the same DLC. If not, ignore it.
                 # We can effectively ignore this Arb ID by not adding it to the Arb ID dictionary.
-                if this_id.original_data['dlc'].nunique() is not 1:
+                if this_id.original_data['dlc'].nunique() != 1:
                     continue
                 this_id.dlc = this_id.original_data['dlc'].iloc[0]
                 this_id.original_data.drop('dlc', axis=1, inplace=True)
@@ -121,14 +126,16 @@ class PreProcessor:
                 # not actually on the bus.
                 if this_id.dlc < 8:
                     for i in range(this_id.dlc, 8):
-                        this_id.original_data.drop('b' + str(i), axis=1, inplace=True)
+                        this_id.original_data.drop(
+                            'b' + str(i), axis=1, inplace=True)
 
                 # Check if there are duplicate index values and correct them.
                 if not this_id.original_data.index.is_unique:
                     correction_mask = this_id.original_data.index.duplicated()
                     this_id.original_data = this_id.original_data[~correction_mask]
 
-                this_id.generate_binary_matrix_and_tang(a_timer, normalize_strategy)
+                this_id.generate_binary_matrix_and_tang(
+                    a_timer, normalize_strategy)
                 this_id.analyze_transmission_frequency(time_convert=time_conversion,
                                                        ci_accuracy=freq_analysis_accuracy,
                                                        synchronous_threshold=freq_synchronous_threshold)
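
Note on the read_csv change: delim_whitespace=True splits columns on any run of
whitespace and is equivalent to sep=r"\s+", so PreProcessor can ingest both the
original whitespace-padded logs and the tab-separated files written by
canUtilsToTSV. One caveat worth checking: skiprows=7 is kept, but files produced
by canUtilsToTSV have no seven-line header, so the first seven frames of a
converted log would be silently dropped. A minimal sketch of the equivalent call,
with a hypothetical filename and the converters/skiprows details omitted:

    from pandas import read_csv

    df = read_csv("vehicle_candump.log.tsv",
                  header=None,
                  names=['time', 'id', 'dlc', 'b0', 'b1',
                         'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
                  sep=r"\s+",  # same behavior as delim_whitespace=True
                  index_col=0)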