Moved my changes to a new branch. This is the original code.

JoshuaArking 2019-08-27 19:25:47 -04:00
parent 71a2912afb
commit 5555c6371f
24 changed files with 265 additions and 585 deletions

0
Pipeline/ArbID.py Executable file → Normal file

0
Pipeline/J1979.py Executable file → Normal file

0
Pipeline/LexicalAnalysis.py Executable file → Normal file

356
Pipeline/Main.py Executable file → Normal file

@@ -9,202 +9,190 @@ from SemanticAnalysis import subset_selection, subset_correlation, greedy_signal
from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster
from PipelineTimer import PipelineTimer
i = 0
j = 0
# File names for the on-disc data input and output.
# Input:
#can_data_filename: str = 'drive_runway_afit.log'
can_data_filename: str = 'loggerProgram0.log'
can_data_filename: str = 'drive_runway_afit.log'
# can_data_filename: str = 'loggerProgram0.log'
while i < 51:
if i == 50 and j < 50: #i need to optimize this and redesign it
j += 1
i = 0
elif i == 50 and j == 50:
i = 51
else:
i += 1
# Output:
output_folder: str = 'output'
pickle_arb_id_filename: str = 'pickleArbIDs.p'
pickle_j1979_filename: str = 'pickleJ1979.p'
pickle_signal_filename: str = 'pickleSignals.p'
pickle_subset_filename: str = 'pickleSubset.p'
csv_correlation_filename: str = 'subset_correlation_matrix.csv'
pickle_j1979_correlation: str = 'pickleJ1979_correlation.p'
pickle_clusters_filename: str = 'pickleClusters.p'
pickle_all_signal_filename: str = 'pickleAllSignalsDataFrame.p'
csv_all_signals_filename: str = 'complete_correlation_matrix.csv'
pickle_timer_filename: str = 'pickleTimer.p'
# Output:
output_folder: str = 'output'
pickle_arb_id_filename: str = 'pickleArbIDs.p'
pickle_j1979_filename: str = 'pickleJ1979.p'
pickle_signal_filename: str = 'pickleSignals.p'
pickle_subset_filename: str = 'pickleSubset.p'
csv_correlation_filename: str = 'subset_correlation_matrix.csv'
pickle_j1979_correlation: str = 'pickleJ1979_correlation.p'
pickle_clusters_filename: str = 'pickleClusters.p'
pickle_all_signal_filename: str = 'pickleAllSignalsDataFrame.p'
csv_all_signals_filename: str = 'complete_correlation_matrix.csv'
pickle_timer_filename: str = 'pickleTimer.p'
# Change out the normalization strategies as needed.
tang_normalize_strategy: Callable = minmax_scale
signal_normalize_strategy: Callable = minmax_scale
# Change out the normalization strategies as needed.
tang_normalize_strategy: Callable = minmax_scale
signal_normalize_strategy: Callable = minmax_scale
# Turn on or off portions of the pipeline and output methods using these flags.
force_pre_processing: bool = False
force_j1979_plotting: bool = True
# Turn on or off portions of the pipeline and output methods using these flags.
force_pre_processing: bool = False
force_j1979_plotting: bool = False
force_lexical_analysis: bool = True
force_arb_id_plotting: bool = True
force_lexical_analysis: bool = False
force_arb_id_plotting: bool = True
force_semantic_analysis: bool = True
force_signal_labeling: bool = True
use_j1979_tags_in_plots: bool = True
force_cluster_plotting: bool = True
force_semantic_analysis: bool = False
force_signal_labeling: bool = False
use_j1979_tags_in_plots: bool = True
force_cluster_plotting: bool = False
dump_to_pickle: bool = True
dump_to_pickle: bool = True
# Parameters and threshold used for Arb ID transmission frequency analysis during Pre-processing.
time_conversion = 1000 # convert seconds to milliseconds
z_lookup = {.8: 1.28, .9: 1.645, .95: 1.96, .98: 2.33, .99: 2.58}
freq_analysis_accuracy = z_lookup[0.9]
freq_synchronous_threshold = 0.1
# Parameters and threshold used for Arb ID transmission frequency analysis during Pre-processing.
time_conversion = 1000 # convert seconds to milliseconds
z_lookup = {.8: 1.28, .9: 1.645, .95: 1.96, .98: 2.33, .99: 2.58}
freq_analysis_accuracy = z_lookup[0.9]
freq_synchronous_threshold = 0.1
# Threshold parameters used during lexical analysis. Default is 0.2
tokenization_bit_distance: float = i/100
tokenize_padding: bool = True
# Threshold parameters used during lexical analysis.
tokenization_bit_distance: float = 0.2
tokenize_padding: bool = True
# Threshold parameters used during semantic analysis
subset_selection_size: float = 0.25
fuzzy_labeling: bool = True
min_correlation_threshold: float = 0.85
# A timer class to record timings throughout the pipeline.
a_timer = PipelineTimer(verbose=True)
# DATA IMPORT AND PRE-PROCESSING #
pre_processor = PreProcessor(can_data_filename, pickle_arb_id_filename, pickle_j1979_filename)
id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
tang_normalize_strategy,
time_conversion,
freq_analysis_accuracy,
freq_synchronous_threshold,
force_pre_processing)
if j1979_dictionary:
plot_j1979(a_timer, j1979_dictionary, force_j1979_plotting)
# LEXICAL ANALYSIS #
print("\n\t\t\t##### BEGINNING LEXICAL ANALYSIS #####")
tokenize_dictionary(a_timer,
id_dictionary,
force_lexical_analysis,
include_padding=tokenize_padding,
merge=True,
max_distance=tokenization_bit_distance)
signal_dictionary = generate_signals(a_timer,
id_dictionary,
pickle_signal_filename,
signal_normalize_strategy,
force_lexical_analysis)
plot_signals_by_arb_id(a_timer, id_dictionary, signal_dictionary, force_arb_id_plotting)
# Threshold parameters used during semantic analysis Default is 0.25 and 0.85
subset_selection_size: float = j/100
fuzzy_labeling: bool = True
min_correlation_threshold: float = 0.85
# SEMANTIC ANALYSIS #
print("\n\t\t\t##### BEGINNING SEMANTIC ANALYSIS #####")
subset_df = subset_selection(a_timer,
signal_dictionary,
pickle_subset_filename,
force_semantic_analysis,
subset_size=subset_selection_size)
corr_matrix_subset = subset_correlation(subset_df, csv_correlation_filename, force_semantic_analysis)
cluster_dict = greedy_signal_clustering(corr_matrix_subset,
correlation_threshold=min_correlation_threshold,
fuzzy_labeling=fuzzy_labeling)
df_full, corr_matrix_full, cluster_dict = label_propagation(a_timer,
pickle_clusters_filename=pickle_clusters_filename,
pickle_all_signals_df_filename=pickle_all_signal_filename,
csv_signals_correlation_filename=csv_all_signals_filename,
signal_dict=signal_dictionary,
cluster_dict=cluster_dict,
correlation_threshold=min_correlation_threshold,
force=force_semantic_analysis)
signal_dictionary, j1979_correlations = j1979_signal_labeling(a_timer=a_timer,
j1979_corr_filename=pickle_j1979_correlation,
df_signals=df_full,
j1979_dict=j1979_dictionary,
signal_dict=signal_dictionary,
correlation_threshold=min_correlation_threshold,
force=force_signal_labeling)
plot_signals_by_cluster(a_timer, cluster_dict, signal_dictionary, use_j1979_tags_in_plots, force_cluster_plotting)
# A timer class to record timings throughout the pipeline.
a_timer = PipelineTimer(verbose=True)
# DATA STORAGE #
if dump_to_pickle:
if force_pre_processing:
if path.isfile(pickle_arb_id_filename):
remove(pickle_arb_id_filename)
if path.isfile(pickle_j1979_filename):
remove(pickle_j1979_filename)
if force_lexical_analysis or force_signal_labeling:
if path.isfile(pickle_signal_filename):
remove(pickle_signal_filename)
if force_semantic_analysis:
if path.isfile(pickle_subset_filename):
remove(pickle_subset_filename)
if path.isfile(csv_correlation_filename):
remove(csv_correlation_filename)
if path.isfile(pickle_j1979_correlation):
remove(pickle_j1979_correlation)
if path.isfile(pickle_clusters_filename):
remove(pickle_clusters_filename)
if path.isfile(pickle_all_signal_filename):
remove(pickle_all_signal_filename)
if path.isfile(csv_all_signals_filename):
remove(csv_all_signals_filename)
# DATA IMPORT AND PRE-PROCESSING #
pre_processor = PreProcessor(can_data_filename, pickle_arb_id_filename, pickle_j1979_filename)
id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
tang_normalize_strategy,
time_conversion,
freq_analysis_accuracy,
freq_synchronous_threshold,
force_pre_processing)
if j1979_dictionary:
plot_j1979(a_timer, j1979_dictionary, force_j1979_plotting)
# LEXICAL ANALYSIS #
print("\n\t\t\t##### BEGINNING LEXICAL ANALYSIS #####")
tokenize_dictionary(a_timer,
id_dictionary,
force_lexical_analysis,
include_padding=tokenize_padding,
merge=True,
max_distance=tokenization_bit_distance)
signal_dictionary = generate_signals(a_timer,
id_dictionary,
pickle_signal_filename,
signal_normalize_strategy,
force_lexical_analysis)
plot_signals_by_arb_id(a_timer, id_dictionary, signal_dictionary, i, force_arb_id_plotting)
# SEMANTIC ANALYSIS #
print("\n\t\t\t##### BEGINNING SEMANTIC ANALYSIS #####")
subset_df = subset_selection(a_timer,
signal_dictionary,
pickle_subset_filename,
force_semantic_analysis,
subset_size=subset_selection_size)
corr_matrix_subset = subset_correlation(subset_df, csv_correlation_filename, force_semantic_analysis)
cluster_dict = greedy_signal_clustering(corr_matrix_subset,
correlation_threshold=min_correlation_threshold,
fuzzy_labeling=fuzzy_labeling)
df_full, corr_matrix_full, cluster_dict = label_propagation(a_timer,
pickle_clusters_filename=pickle_clusters_filename,
pickle_all_signals_df_filename=pickle_all_signal_filename,
csv_signals_correlation_filename=csv_all_signals_filename,
signal_dict=signal_dictionary,
cluster_dict=cluster_dict,
correlation_threshold=min_correlation_threshold,
force=force_semantic_analysis)
signal_dictionary, j1979_correlations = j1979_signal_labeling(a_timer=a_timer,
j1979_corr_filename=pickle_j1979_correlation,
df_signals=df_full,
j1979_dict=j1979_dictionary,
signal_dict=signal_dictionary,
correlation_threshold=min_correlation_threshold,
force=force_signal_labeling)
plot_signals_by_cluster(a_timer, cluster_dict, signal_dictionary, use_j1979_tags_in_plots, i, force_cluster_plotting)
# DATA STORAGE #
if dump_to_pickle:
if force_pre_processing:
if path.isfile(pickle_arb_id_filename):
remove(pickle_arb_id_filename)
if path.isfile(pickle_j1979_filename):
remove(pickle_j1979_filename)
if force_lexical_analysis or force_signal_labeling:
if path.isfile(pickle_signal_filename):
remove(pickle_signal_filename)
if force_semantic_analysis:
if path.isfile(pickle_subset_filename):
remove(pickle_subset_filename)
if path.isfile(csv_correlation_filename):
remove(csv_correlation_filename)
if path.isfile(pickle_j1979_correlation):
remove(pickle_j1979_correlation)
if path.isfile(pickle_clusters_filename):
remove(pickle_clusters_filename)
if path.isfile(pickle_all_signal_filename):
remove(pickle_all_signal_filename)
if path.isfile(csv_all_signals_filename):
remove(csv_all_signals_filename)
timer_flag = 0
if not path.exists(output_folder):
mkdir(output_folder)
chdir(output_folder)
if not path.isfile(pickle_arb_id_filename):
timer_flag += 1
print("\nDumping arb ID dictionary to " + pickle_arb_id_filename)
dump(id_dictionary, open(pickle_arb_id_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_j1979_filename):
timer_flag += 1
print("\nDumping J1979 dictionary to " + pickle_j1979_filename)
dump(j1979_dictionary, open(pickle_j1979_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_signal_filename):
timer_flag += 1
print("\nDumping signal dictionary to " + pickle_signal_filename)
dump(signal_dictionary, open(pickle_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_subset_filename):
timer_flag += 1
print("\nDumping signal subset list to " + pickle_subset_filename)
dump(subset_df, open(pickle_subset_filename, "wb"))
print("\tComplete...")
if not path.isfile(csv_correlation_filename):
timer_flag += 1
print("\nDumping subset correlation matrix to " + csv_correlation_filename)
corr_matrix_subset.to_csv(csv_correlation_filename)
print("\tComplete...")
if not path.isfile(pickle_j1979_correlation):
timer_flag += 1
print("\nDumping J1979 correlation DataFrame to " + pickle_j1979_correlation)
dump(j1979_correlations, open(pickle_j1979_correlation, "wb"))
print("\tComplete...")
if not path.isfile(pickle_clusters_filename):
timer_flag += 1
print("\nDumping cluster dictionary to " + pickle_clusters_filename)
dump(cluster_dict, open(pickle_clusters_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_all_signal_filename):
timer_flag += 1
print("\nDumping complete signals DataFrame to " + pickle_all_signal_filename)
dump(df_full, open(pickle_all_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(csv_all_signals_filename):
timer_flag += 1
print("\nDumping complete correlation matrix to " + csv_all_signals_filename)
corr_matrix_full.to_csv(csv_all_signals_filename)
print("\tComplete...")
if timer_flag == 9:
print("\nDumping pipeline timer to " + pickle_timer_filename)
dump(a_timer, open(pickle_timer_filename, "wb"))
print("\tComplete...")
chdir("..")
timer_flag = 0
if not path.exists(output_folder):
mkdir(output_folder)
chdir(output_folder)
if not path.isfile(pickle_arb_id_filename):
timer_flag += 1
print("\nDumping arb ID dictionary to " + pickle_arb_id_filename)
dump(id_dictionary, open(pickle_arb_id_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_j1979_filename):
timer_flag += 1
print("\nDumping J1979 dictionary to " + pickle_j1979_filename)
dump(j1979_dictionary, open(pickle_j1979_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_signal_filename):
timer_flag += 1
print("\nDumping signal dictionary to " + pickle_signal_filename)
dump(signal_dictionary, open(pickle_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_subset_filename):
timer_flag += 1
print("\nDumping signal subset list to " + pickle_subset_filename)
dump(subset_df, open(pickle_subset_filename, "wb"))
print("\tComplete...")
if not path.isfile(csv_correlation_filename):
timer_flag += 1
print("\nDumping subset correlation matrix to " + csv_correlation_filename)
corr_matrix_subset.to_csv(csv_correlation_filename)
print("\tComplete...")
if not path.isfile(pickle_j1979_correlation):
timer_flag += 1
print("\nDumping J1979 correlation DataFrame to " + pickle_j1979_correlation)
dump(j1979_correlations, open(pickle_j1979_correlation, "wb"))
print("\tComplete...")
if not path.isfile(pickle_clusters_filename):
timer_flag += 1
print("\nDumping cluster dictionary to " + pickle_clusters_filename)
dump(cluster_dict, open(pickle_clusters_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_all_signal_filename):
timer_flag += 1
print("\nDumping complete signals DataFrame to " + pickle_all_signal_filename)
dump(df_full, open(pickle_all_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(csv_all_signals_filename):
timer_flag += 1
print("\nDumping complete correlation matrix to " + csv_all_signals_filename)
corr_matrix_full.to_csv(csv_all_signals_filename)
print("\tComplete...")
if timer_flag == 9:
print("\nDumping pipeline timer to " + pickle_timer_filename)
dump(a_timer, open(pickle_timer_filename, "wb"))
print("\tComplete...")
chdir("..")

0
Pipeline/PipelineTimer.py Executable file → Normal file

9
Pipeline/Plotter.py Executable file → Normal file

@@ -16,10 +16,7 @@ cluster_folder: str = 'clusters'
j1979_folder: str = 'j1979'
def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, settings: int, force: bool = False):
arb_id_folder = 'figures' + str(settings)
def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, force: bool=False):
if path.exists(arb_id_folder):
if force:
rmtree(arb_id_folder)
@@ -32,7 +29,7 @@ def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dic
for k_id, signals in signal_dict.items():
arb_id = arb_id_dict[k_id]
if not arb_id.static:
print(str(settings) + "Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ")")
print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ")")
a_timer.start_iteration_time()
signals_to_plot = []
@@ -102,9 +99,7 @@ def plot_signals_by_cluster(a_timer: PipelineTimer,
cluster_dict: dict,
signal_dict: dict,
use_j1979_tags: bool,
settings: int,
force: bool=False):
cluster_folder = 'cluster' + str(settings)
if path.exists(cluster_folder):
if force:
rmtree(cluster_folder)

2
Pipeline/PreProcessor.py Executable file → Normal file

@@ -44,7 +44,7 @@ class PreProcessor:
header=None,
names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
skiprows=7,
delimiter=' ',
delimiter='\t',
converters=convert_dict,
index_col=0)

0
Pipeline/SemanticAnalysis.py Executable file → Normal file

0
Pipeline/Signal.py Executable file → Normal file

0
Pipeline_multi-file/ArbID.py Executable file → Normal file

5
Pipeline_multi-file/FileBoi.py Executable file → Normal file

@@ -60,14 +60,13 @@ class FileBoi:
# Check if this file name matches the expected name for a CAN data sample. If so, create new Sample
m = re.match('loggerProgram[\d]+.log', file)
if m:
i = 0
if not (make, model, year) in sample_dict:
sample_dict[(make, model, year)] = []
this_sample_index = str(len(sample_dict[(make, model, year)]))
this_sample = Sample(make=make, model=model, year=year, sample_index=this_sample_index,
sample_path=dirName + "/" + m.group(0), kfold_n=kfold_n)
sample_path=dirName + "/" + m.group(0), kfold_n=kfold_n)
sample_dict[(make, model, year)].append(this_sample)
current_vehicle = []
current_vehicle = []
else:
if this_dir == "Captures":
continue

0
Pipeline_multi-file/J1979.py Executable file → Normal file

130
Pipeline_multi-file/KnownSignalAnalysis.py (file deleted)

@@ -1,130 +0,0 @@
from numpy import float64, nditer, uint64, zeros, ndarray, inf
from pandas import Series, DataFrame
from os import path, remove
from pickle import load
from ArbID import ArbID
from Signal import Signal
from PipelineTimer import PipelineTimer
from typing import List
from scipy import integrate
def transform_signal(a_timer: PipelineTimer,
arb_id_dict: dict,
signal_dict: dict,
transform_pickle_filename: str,
normalize_strategy,
given_arb_id: int,
force=False):
if force and path.isfile(transform_pickle_filename):
remove(transform_pickle_filename)
if path.isfile(transform_pickle_filename):
print("\nSignal transformation already completed and forcing is turned off. Using pickled data...")
return load(open(transform_pickle_filename, "rb"))
a_timer.start_function_time()
transform_dict = signal_dict
# arb_id_dict[given_arb_id * 256] = ArbID(given_arb_id * 256)
for k, arb_id in arb_id_dict.items():
# print(str(arb_id.id) + " == " + str(given_arb_id) + " ?\n")
if arb_id.id == given_arb_id:
arb_id.static = False
arb_id.short = False
if not arb_id.static:
for token in arb_id.tokenization:
a_timer.start_iteration_time()
signal = Signal(k * 256, token[0], token[1])
signal.static = False
# Convert the binary ndarray to a list of string representations of each row
temp1 = [''.join(str(x) for x in row) for row in arb_id.boolean_matrix[:, token[0]:token[1] + 1]]
temp2 = zeros((temp1.__len__()+1), dtype=uint64)
# convert each string representation to int
for i, row in enumerate(temp1):
temp2[i] = int(row, 2)
temp3 = integrate.cumtrapz(temp2)
print("Arb Id " + str(k) + ", Signal from " + str(token[0]) + " to " + str(token[1]) + " Integrated successfully")
# create an unsigned integer pandas.Series using the time index from this Arb ID's original data.
signal.time_series = Series(temp3[:], index=arb_id.original_data.index, dtype=float64)
# Normalize the signal and update its meta-data
signal.normalize_and_set_metadata(normalize_strategy)
# add this signal to the signal dictionary which is keyed by Arbitration ID
if (k * 256) in transform_dict:
transform_dict[k * 256][(arb_id.id * 256, signal.start_index, signal.stop_index)] = signal
else:
print("Successfully added at transform dict")
transform_dict[k * 256] = {(arb_id.id * 256, signal.start_index, signal.stop_index): signal}
a_timer.set_token_to_signal()
a_timer.set_signal_generation()
return transform_dict
def transform_signals(a_timer: PipelineTimer,
arb_id_dict: dict,
transform_pickle_filename: str,
normalize_strategy,
force=False):
if force and path.isfile(transform_pickle_filename):
remove(transform_pickle_filename)
if path.isfile(transform_pickle_filename):
print("\nSignal transformation already completed and forcing is turned off. Using pickled data...")
return load(open(transform_pickle_filename, "rb"))
a_timer.start_function_time()
transform_dict = {} # arb_id_dict
for k, arb_id in arb_id_dict.items():
if not arb_id.static:
for token in arb_id.tokenization:
a_timer.start_iteration_time()
signal = Signal(k * 256, token[0], token[1])
# Convert the binary ndarray to a list of string representations of each row
temp1 = [''.join(str(x) for x in row) for row in arb_id.boolean_matrix[:, token[0]:token[1] + 1]]
temp2 = zeros((temp1.__len__()+1), dtype=uint64)
# convert each string representation to int
for i, row in enumerate(temp1):
temp2[i] = int(row, 2)
temp3 = integrate.cumtrapz(temp2)
# create an unsigned integer pandas.Series using the time index from this Arb ID's original data.
signal.time_series = Series(temp3[:], index=arb_id.original_data.index, dtype=float64)
# Normalize the signal and update its meta-data
signal.normalize_and_set_metadata(normalize_strategy)
# add this signal to the signal dictionary which is keyed by Arbitration ID
if k in transform_dict:
transform_dict[k][(arb_id.id, signal.start_index, signal.stop_index)] = signal
else:
transform_dict[k] = {(arb_id.id, signal.start_index, signal.stop_index): signal}
a_timer.set_token_to_signal()
a_timer.set_signal_generation()
return transform_dict

0
Pipeline_multi-file/LexicalAnalysis.py Executable file → Normal file

30
Pipeline_multi-file/Main.py Executable file → Normal file

@@ -5,22 +5,14 @@ from Sample import Sample
# Cross validation parameters for finding an optimal tokenization inversion distance threshold -- NOT WORKING?
kfold_n: int = 5
current_vehicle_number = 0
known_speed_arb_id = 514
good_boi = FileBoi()
samples = good_boi.go_fetch(kfold_n)
for key, sample_list in samples.items(): # type: tuple, list
for sample in sample_list: # type: Sample
# sample.tang_inversion_bit_dist += (0.01 * current_vehicle_number)
# sample.max_inter_cluster_dist += (0.01 * current_vehicle_number)
# sample.tang_inversion_bit_dist = round(sample.tang_inversion_bit_dist, 2) # removes floating point errors
# sample.max_inter_cluster_dist = round(sample.max_inter_cluster_dist, 2)
# print("\n\t##### Settings are " + str(sample.tang_inversion_bit_dist) + " and " + str(
# sample.max_inter_cluster_dist) + " #####")
print(current_vehicle_number)
print("\nData import and Pre-Processing for " + sample.output_vehicle_dir)
id_dict, j1979_dict = sample.pre_process(known_speed_arb_id)
id_dict, j1979_dict = sample.pre_process()
if j1979_dict:
sample.plot_j1979(j1979_dict, vehicle_number=str(current_vehicle_number))
@@ -33,22 +25,14 @@ for key, sample_list in samples.items(): # type: tuple, list
print("\n\t##### BEGINNING LEXICAL ANALYSIS OF " + sample.output_vehicle_dir + " #####")
sample.tokenize_dictionary(id_dict)
signal_dict = sample.generate_signals(id_dict, bool(j1979_dict))
# sample.plot_arb_ids(id_dict, signal_dict, vehicle_number=str(current_vehicle_number))
sample.plot_arb_ids(id_dict, signal_dict, vehicle_number=str(current_vehicle_number))
# KNOWN SIGNAL ANALYSIS #
print("\n\t##### BEGINNING KNOWN SIGNAL ANALYSIS OF " + sample.output_vehicle_dir + " #####")
transform_dict= sample.transform_signal(id_dict, signal_dict, known_speed_arb_id)
sample.plot_arb_ids(id_dict, transform_dict, vehicle_number=str(current_vehicle_number))
# SEMANTIC ANALYSIS #
# LEXICAL ANALYSIS #
print("\n\t##### BEGINNING SEMANTIC ANALYSIS OF " + sample.output_vehicle_dir + " #####")
corr_matrix, combined_df = sample.generate_correlation_matrix(transform_dict)
corr_matrix, combined_df = sample.generate_correlation_matrix(signal_dict)
if j1979_dict:
transform_dict, j1979_correlation = sample.j1979_labeling(j1979_dict, transform_dict, combined_df)
signal_dict, j1979_correlation = sample.j1979_labeling(j1979_dict, signal_dict, combined_df)
cluster_dict, linkage_matrix = sample.cluster_signals(corr_matrix)
# sample.plot_clusters(cluster_dict, signal_dict, bool(j1979_dict), vehicle_number=str(current_vehicle_number))
sample.plot_known_signal_cluster(cluster_dict, signal_dict, bool(j1979_dict), known_speed_arb_id, vehicle_number=str(current_vehicle_number))
sample.plot_clusters(cluster_dict, signal_dict, bool(j1979_dict), vehicle_number=str(current_vehicle_number))
sample.plot_dendrogram(linkage_matrix, vehicle_number=str(current_vehicle_number))
current_vehicle_number += 1

0
Pipeline_multi-file/PipelineTimer.py Executable file → Normal file

165
Pipeline_multi-file/Plotter.py Executable file → Normal file

@@ -25,13 +25,13 @@ def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dic
rmtree(arb_id_folder)
else:
print("\nArbID plotting appears to have already been done and forcing is turned off. Skipping...")
# return
return
a_timer.start_function_time()
for k_id, signals in signal_dict.items():
arb_id = arb_id_dict[k_id]
if (not arb_id.static and not arb_id.short) or k_id == 155136:
if not arb_id.static and not arb_id.short:
print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ") for Vehicle " + vehicle_number)
a_timer.start_iteration_time()
@@ -85,7 +85,7 @@ def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dic
chdir(arb_id_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig(hex(signal.arb_id) + "." + figure_format,
savefig(hex(arb_id.id) + "." + figure_format,
bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
@@ -311,162 +311,3 @@ def plot_dendrogram(a_timer: PipelineTimer,
transparent=figure_transp)
plt.close()
print("\t\tComplete...")
def plot_known_signal_cluster(a_timer: PipelineTimer,
cluster_dict: dict,
signal_dict: dict,
use_j1979_tags: bool,
vehicle_number: str,
given_arb_id: int,
force: bool = False):
if path.exists(cluster_folder):
if force:
rmtree(cluster_folder)
else:
print("\nCluster plotting appears to have already been done and forcing is turned off. Skipping...")
return
a_timer.start_function_time()
print("\n")
for cluster_number, list_of_signals in cluster_dict.items():
if [v for i, v in enumerate(list_of_signals) if (v[0] == given_arb_id or v[0] == given_arb_id * 256)]:
print("Plotting cluster", cluster_number, "with " + str(len(list_of_signals)) + " signals.")
a_timer.start_iteration_time()
# Setup the plot
fig, axes = plt.subplots(nrows=len(list_of_signals), ncols=1, squeeze=False)
plt.suptitle("Signal Cluster " + str(cluster_number) + " from Vehicle " + vehicle_number,
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + len(list_of_signals)+1) * 1.3)
size_adjust = len(list_of_signals) / 100
# The min() statement provides whitespace for the suptitle depending on the number of subplots.
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
# Plot the time series of each signal in the cluster
for i, signal_key in enumerate(list_of_signals):
signal = signal_dict[signal_key[0]][signal_key]
ax = axes[i, 0]
if signal.j1979_title and use_j1979_tags:
this_title = signal.plot_title + " [" + signal.j1979_title + \
" (PCC:" + str(round(signal.j1979_pcc, 2)) + ")]"
else:
this_title = signal.plot_title
ax.set_title(this_title,
style='italic',
size='medium')
ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
ax.plot(signal.time_series, color='black')
if not path.exists(cluster_folder):
mkdir(cluster_folder)
chdir(cluster_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
if len(list_of_signals) < 100: # prevents errors when given too low a setting for correlation
savefig("cluster_" + str(cluster_number) + "." + figure_format,
bbox_iches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
else:
print("Too many clusters to plot! Skipping...")
chdir("..")
plt.close(fig)
a_timer.set_plot_save_cluster()
print("\tComplete...")
a_timer.set_plot_save_cluster_dict()
def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, vehicle_number: str,
force: bool=False):
if path.exists(arb_id_folder):
if force:
rmtree(arb_id_folder)
else:
print("\nArbID plotting appears to have already been done and forcing is turned off. Skipping...")
# return
a_timer.start_function_time()
for k_id, signals in signal_dict.items():
arb_id = arb_id_dict[k_id]
if (not arb_id.static and not arb_id.short) or k_id == 155136:
print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ") for Vehicle " + vehicle_number)
a_timer.start_iteration_time()
signals_to_plot = []
# Don't plot the static signals
for k_signal, signal in signals.items():
if not signal.static:
signals_to_plot.append(signal)
# There's a corner case where the Arb ID only has static signals. This conditional accounts for this.
# TODO: This corner case should probably be reflected by arb_id.static.
if len(signals_to_plot) < 1:
continue
# One row per signal plus one for the TANG. Squeeze is used to force axes to be an array to avoid errors.
fig, axes = plt.subplots(nrows=1 + len(signals_to_plot), ncols=1)
plt.suptitle("Time Series and TANG for Arbitration ID " + hex(k_id) + " from Vehicle " + vehicle_number,
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + len(signals_to_plot) + 1) * 1.3)
# The min() statement provides whitespace for the title depending on the number of subplots.
size_adjust = len(signals_to_plot) / 100
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
for i, signal in enumerate(signals_to_plot):
ax = axes[i]
ax.set_title(signal.plot_title,
style='italic',
size='medium')
ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
ax.plot(signal.time_series, color='black')
# Add a 25% opacity dashed black line to the entropy gradient plot at one boundary of each sub-flow
axes[-1].axvline(x=signal.start_index, alpha=0.25, c='black', linestyle='dashed')
# Plot the entropy gradient at the bottom of the overall output
ax = axes[-1]
ax.set_title("Min-Max Normalized Transition Aggregation N-Gram (TANG)",
style='italic',
size='medium')
tang_bit_width = arb_id.tang.shape[0]
ax.set_xlim([-0.01 * tang_bit_width, 1.005 * tang_bit_width])
y = arb_id.tang[:]
# Differentiate bit positions with non-zero and zero entropy using black points and grey x respectively.
ix = isin(y, 0)
pad_bit = where(ix)
non_pad_bit = where(~ix)
ax.scatter(non_pad_bit, y[non_pad_bit], color='black', marker='o', s=10)
ax.scatter(pad_bit, y[pad_bit], color='grey', marker='^', s=10)
if not path.exists(arb_id_folder):
mkdir(arb_id_folder)
chdir(arb_id_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig(hex(signal.arb_id) + "." + figure_format,
bbox_iches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
chdir("..")
plt.close(fig)
a_timer.set_plot_save_arb_id()
print("\tComplete...")
a_timer.set_plot_save_arb_id_dict()

10
Pipeline_multi-file/PreProcessor.py Executable file → Normal file

@@ -1,4 +1,4 @@
from pandas import DataFrame, read_csv, Series, concat
from pandas import DataFrame, read_csv, Series
from numpy import int64
from os import path, remove, getcwd
from pickle import load
@@ -45,7 +45,7 @@ class PreProcessor:
header=None,
names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
skiprows=7,
delimiter=' ',
delimiter='\t',
converters=convert_dict,
index_col=0)
@@ -70,7 +70,6 @@ class PreProcessor:
time_conversion: int = 1000,
freq_analysis_accuracy: float = 0.0,
freq_synchronous_threshold: float = 0.0,
given_arb_id: int = 0,
force: bool = False) -> (dict, dict):
id_dictionary = {}
j1979_dictionary = {}
@@ -93,11 +92,6 @@ class PreProcessor:
return id_dictionary, j1979_dictionary
else:
self.import_csv(a_timer, self.data_filename)
this_id = self.data.loc[self.data['id'] == given_arb_id].copy()
this_id.id = given_arb_id * 256
combined = concat([self.data, this_id])
self.data = combined
a_timer.start_function_time()

52
Pipeline_multi-file/Sample.py Executable file → Normal file

@@ -2,7 +2,7 @@ from PreProcessor import PreProcessor
from Validator import Validator
from LexicalAnalysis import tokenize_dictionary, generate_signals
from SemanticAnalysis import generate_correlation_matrix, signal_clustering, j1979_signal_labeling
from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster, plot_dendrogram, plot_known_signal_cluster
from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster, plot_dendrogram
from sklearn.preprocessing import minmax_scale
from typing import Callable
from PipelineTimer import PipelineTimer
@@ -11,8 +11,6 @@ from pickle import dump, load
from numpy import ndarray, zeros, float16
from pandas import DataFrame
from KnownSignalAnalysis import transform_signals, transform_signal
# File names for the on-disc data input and output.
output_folder: str = 'output'
pickle_arb_id_filename: str = 'pickleArbIDs.p'
@@ -28,8 +26,6 @@ pickle_combined_df_filename: str = 'pickleCombinedDataFrame.p'
csv_all_signals_filename: str = 'complete_correlation_matrix.csv'
pickle_timer_filename: str = 'pickleTimer.p'
pickle_transform_filename: str = 'pickleTransform'
dump_to_pickle: bool = True
# Change out the normalization strategies as needed.
@@ -43,11 +39,9 @@ force_threshold_plotting: bool = False
force_j1979_plotting: bool = True
use_j1979: bool = True
force_transform: bool = False
force_lexical_analysis: bool = False
force_signal_generation: bool = False
force_arb_id_plotting: bool = False
force_arb_id_plotting: bool = True
force_correlation_matrix: bool = False
force_clustering: bool = False
@@ -64,15 +58,16 @@ freq_synchronous_threshold = 0.1
# Threshold parameters used during lexical analysis.
tokenization_bit_distance: float = 0.2
tokenize_padding: bool = False # changing this to false seems to help better find weak signals
tokenize_padding: bool = True
merge_tokens: bool = True
# Threshold parameters used during semantic analysis
subset_selection_size: float = 0.25
max_intra_cluster_distance: float = 0.10 # normally 0.25
max_intra_cluster_distance: float = 0.20
min_j1979_correlation: float = 0.85
# fuzzy_labeling: bool = True
# A timer class to record timings throughout the pipeline.
a_timer = PipelineTimer(verbose=True)
@@ -117,7 +112,7 @@ class Sample:
# Move back to root of './output/make_model_year/sample_index/"
chdir("../../../")
def pre_process(self, given_arb_id):
def pre_process(self):
self.make_and_move_to_vehicle_directory()
pre_processor = PreProcessor(self.path, pickle_arb_id_filename, pickle_j1979_filename, self.use_j1979)
id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
@@ -125,7 +120,6 @@ class Sample:
time_conversion,
freq_analysis_accuracy,
freq_synchronous_threshold,
given_arb_id,
force_pre_processing)
if dump_to_pickle:
if force_pre_processing:
@@ -309,37 +303,3 @@ class Sample:
plot_dendrogram(a_timer=a_timer, linkage_matrix=linkage_matrix, threshold=self.max_inter_cluster_dist,
vehicle_number=vehicle_number, force=force_dendrogram_plotting)
self.move_back_to_parent_directory()
def transform_signals(self, id_dictionary: dict):
self.make_and_move_to_vehicle_directory()
transform_dict = transform_signals(a_timer=a_timer,
arb_id_dict=id_dictionary,
transform_pickle_filename=pickle_transform_filename,
normalize_strategy=signal_normalize_strategy,
force=force_transform)
self.move_back_to_parent_directory()
return transform_dict
def transform_signal(self, id_dictionary: dict, signal_dict: dict, arb_id: int):
self.make_and_move_to_vehicle_directory()
transform_dict = transform_signal(a_timer=a_timer,
arb_id_dict=id_dictionary,
signal_dict=signal_dict,
transform_pickle_filename=pickle_transform_filename,
normalize_strategy=signal_normalize_strategy,
given_arb_id=arb_id,
force=force_transform)
self.move_back_to_parent_directory()
return transform_dict
def plot_known_signal_cluster(self, cluster_dictionary: dict, signal_dictionary: dict, use_j1979_tags: bool,
known_signal: int, vehicle_number: str):
self.make_and_move_to_vehicle_directory()
plot_known_signal_cluster(a_timer=a_timer,
cluster_dict=cluster_dictionary,
signal_dict=signal_dictionary,
use_j1979_tags=use_j1979_tags,
vehicle_number=vehicle_number,
given_arb_id=known_signal,
force=force_cluster_plotting)
self.move_back_to_parent_directory()

4
Pipeline_multi-file/SemanticAnalysis.py Executable file → Normal file

@@ -1,5 +1,5 @@
from pandas import concat, DataFrame, read_csv
from numpy import ndarray, zeros, clip
from numpy import ndarray, zeros
from os import path, remove
from pickle import load, dump
from ast import literal_eval
@@ -77,7 +77,7 @@ def signal_clustering(corr_matrix: DataFrame,
corr_matrix.where(corr_matrix > 0, 0, inplace=True)
corr_matrix = 1 - corr_matrix
X = corr_matrix.values # type: ndarray
Y = clip(ssd.squareform(X), 0, None)
Y = ssd.squareform(X)
# Z is the linkage matrix. This can serve as input to the scipy.cluster.hierarchy.dendrogram method
Z = linkage(Y, method='single', optimal_ordering=True)
fclus = fcluster(Z, t=threshold, criterion='distance')

0
Pipeline_multi-file/Signal.py Executable file → Normal file

0
Pipeline_multi-file/Validator.py Executable file → Normal file

0
Pipeline_multi-file/maximize_sum_shannon.py Executable file → Normal file

README.md

@@ -1,32 +1,81 @@
To see the original README, please view the project this is forked from: brent-stone/CAN_Reverse_Engineering.
- The vast majority of this project is his work.
# Automated CAN Payload Reverse Engineering
This fork of CAN_Reverse_Engineering adds **KnownSignalAnalysis.py**, which takes a single given Arb ID and integrates it.
The given Arb ID is defined in **Main.py** as known_speed_arb_id. The integration is done because the given Arb ID carries vehicle speed, and the integral of speed is distance.
Therefore, once normalized, the integrated speed signal from the given Arb ID should be extremely similar to the odometer signal.
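As a rough illustration of that idea (not code from this repository), the sketch below integrates a speed series with SciPy's cumtrapz and min-max normalizes the result, the same building blocks **KnownSignalAnalysis.py** uses; the `speed_series` name and the wrapper function are assumptions.

```python
# Minimal sketch, assuming `speed_series` is a pandas Series of raw speed
# payload values indexed by timestamp (e.g. milliseconds). Not repository code.
from numpy import float64
from pandas import Series
from scipy import integrate
from sklearn.preprocessing import minmax_scale


def integrate_and_normalize(speed_series: Series) -> Series:
    # Cumulative trapezoidal integration: the integral of speed approximates
    # distance travelled, i.e. an odometer-like signal.
    distance = integrate.cumtrapz(speed_series.values, x=speed_series.index.values)
    # cumtrapz returns len(y) - 1 points, so realign to the index without its first entry.
    normalized = minmax_scale(distance.astype(float64))
    return Series(normalized, index=speed_series.index[1:], dtype=float64)
```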
## NOTICE
> The views expressed in this document and code are those of the author and do not reflect the official policy or position of the United States Air Force, the United States Army, the United States Department of Defense or the United States Government. This material is declared a work of the U.S. Government and is not subject to copyright protection in the United States. Approval for public disclosure of this code was approved by the 88th Air Base Wing Public Affairs on 08 March 2019 under case number 88ABW-2019-0910. Unclassified disclosure of the dissertation was approved on 03 January 2019 under case number 88ABW-2019-0024.
-----------------------------------------------------------------------------------------
I will do my best to detail my changes below. In the future, changes will be listed in the commits of the relevant files.
This project houses Python and R scripts intended to facilitate the automated reverse engineering of Controller Area Network (CAN) payloads observed from passenger vehicles. This code was originally developed by Dr. Brent Stone at the Air Force Institute of Technology in pursuit of a Doctor of Philosophy in Computer Science. Please see the included dissertation titled "Enabling Auditing and Intrusion Detection for Proprietary Controller Area Networks" for details about the methods used. Please open an issue letting me know if you find any typos, bad grammar, copyrighted images of yours that you want removed, or other issues!
In the function generate_arb_id_dictionary(), located in **PreProcessor.py**, once the known_speed_arb_id is found, a new Arb ID is created as an exact copy of it.
This copy has its this_id.id set to 256 times the original's, purely because multiplying by 256 appends two hexadecimal zeros to the Arb ID, so I would not have to change the plotting function filenames.
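For reference, here is a hedged sketch of that duplication step, modeled on the lines this commit removes from generate_arb_id_dictionary(); the standalone function and its `data` argument are illustrative rather than the repository's actual API.

```python
# Sketch of duplicating the known speed Arb ID's frames under a new ID,
# mirroring the removed PreProcessor code; names are illustrative.
from pandas import DataFrame, concat


def duplicate_known_arb_id(data: DataFrame, given_arb_id: int) -> DataFrame:
    # Copy every frame sent on the known speed Arb ID...
    known_rows = data.loc[data['id'] == given_arb_id].copy()
    # ...and re-key the copy at 256x the original ID. In hex this just appends
    # two zeros, so the copy gets distinct plot filenames with no other changes.
    known_rows['id'] = given_arb_id * 256
    return concat([data, known_rows])
```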
Special thank you to Dave Blundell, co-author of the Car Hacker's Handbook, and the Open Garages community for technical advice and serving as a sounding board.
Next is the transform_signal() function in **KnownSignalAnalysis.py**, which largely shares its code with generate_signals() in **LexicalAnalysis.py**.
transform_signal() finds the Arb ID created in generate_arb_id_dictionary() and integrates its component signals.
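The bit-slice-to-integer step that precedes the integration looks roughly like the sketch below, based on the deleted **KnownSignalAnalysis.py** code shown earlier in this commit; the variable names are mine.

```python
# Illustrative sketch: turn one tokenized bit range of an Arb ID's boolean
# payload matrix into an unsigned-integer time series. Not repository code.
from numpy import ndarray, uint64, zeros


def token_to_int_series(boolean_matrix: ndarray, start_bit: int, stop_bit: int) -> ndarray:
    # Each row of boolean_matrix holds one CAN frame's payload bits (0/1).
    bit_strings = [''.join(str(b) for b in row)
                   for row in boolean_matrix[:, start_bit:stop_bit + 1]]
    values = zeros(len(bit_strings), dtype=uint64)
    # Parse each row's bits as one unsigned integer sample of the signal.
    for i, bits in enumerate(bit_strings):
        values[i] = int(bits, 2)
    return values
```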
## Tips and Advice
These scripts won't run immediately when cloning this repo. Hopefully these tips will save you time and frustration saying "WHY WON'T THESE THINGS WORK!?!?!" Please ask questions by posting in the [Open Garages Google group](https://groups.google.com/forum/#!forum/open-garages). These scripts were developed and tested using Python 3.6. Please make sure you have the Numpy, Pandas, & scikit-learn packages available to your Python interpreter.
During Semantic Analysis, the known integrated speed signal should be clustered with the unknown odometer signal.
The files are organized with an example CAN data sample and three folders. Each folder is a self-contained set of interdependent Python classes or R scripts for examining CAN data in the format shown in the example LoggerProgram0.log. Different file formats can be used by adjusting PreProcessor.py accordingly.
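For orientation, the sketch below shows roughly how **PreProcessor.py** reads such a .log file into a Pandas DataFrame, based on the read_csv call visible in the diff above; the delimiter and skiprows values depend on the logger, and the hex-to-int converters used by the real code are omitted here.

```python
# Rough sketch of the .log import step; adjust delimiter/skiprows/converters
# to match your logger's output format.
from pandas import DataFrame, read_csv


def import_can_log(filename: str) -> DataFrame:
    return read_csv(filename,
                    header=None,
                    names=['time', 'id', 'dlc',
                           'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
                    skiprows=7,        # skip the logger's preamble lines
                    delimiter='\t',    # this commit switches from ' ' to '\t'
                    index_col=0)       # index the frames by timestamp
```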
Finally, there are of course changes in **Main.py**, **Sample.py**, and **Plotter.py** to implement these functions. Specifically, there are some new plotting functions that plot only the transformed signal.
* Folder 1: **Pipeline**
* Simply copy LoggerProgram0.log into this folder and run home.py.
* This is the most basic implementation of the pipeline described in the dissertation. Over 80% of the code is referenced from home.py. Follow the calls made in home.py to see how the data are sequentially processed and saved to disk.
* The remaining 20% is unused portions of code which were left in place to either serve as a reference for different ways of doing things in Python or interesting experiments which were worth preserving (like the Smith-Waterman search).
I have tested this on 3 vehicles, and it has been successful on 2 of them. The unsuccessful case is likely due to some sort of encoding scheme on the odometer signal.
* Folder 2: **Pipeline_multi-file**
* This is the most complete and robust implementation of the concepts presented in the dissertation; however, the code is also more complicated to enable automated processing of many CAN data samples at one time. If you aren't already very comfortable with Python and Pandas, make sure you understand how the scripts in the **Pipeline** folder work before attempting to go through this expanded version of the code.
I am new to using git, and this is my first major Python task after 5 years of coding in C, so please excuse any C-isms in my Python code or mistakes in how I maintain this GitHub repository.
In addition, I mostly focused on throwing this together as fast as possible, so there is a large amount of sloppy coding and technical debt.
* This folder includes the same classes from **Pipeline**. However, **SOME BUGS WERE FIXED HERE** but **NOT** in the classes saved in **Pipeline**. If a generous soul wants to transplant the fixes back into **Pipeline**, I will happily merge the fork.
I will probably not be maintaining this project moving forward due to the rushed implementation. It's more of a proof of concept. I plan to rewrite it properly soon.
If you are interested in forking this project or making a pull request, it's probably best if you just wait until the rewrite.
* Make sure you read the comments about the expected folder structure!
Feel free to email me at jarking@umich.edu with any questions!
* Folder 3: **R Scripts**
* The R scripts require the [rEDM](https://cran.r-project.org/web/packages/rEDM/vignettes/rEDM-tutorial.html) package. Look for commands_list.txt for a sequential series of R commands.
* The folders "city" and "home" include .csv files of engine RPM, brake pressure, and vehicle speed time series during different driving conditions. Each folder includes a "commands_list_####.txt" file for copy-paste R commands to analyze this data using the rEDM package.
* .Rda files and .pdf graphical output are examples of output using the R commands and provided .csv data.
## Script specific information by folder
### Pipeline
**Input**: CAN data in the format demonstrated in LoggerProgram0.log
* **Main.py**
1. **Purpose**: This script links and calls all remaining scripts in this folder. It handles some global variables used for modifying the flow of data between scripts as well as any files output to the local hard disk.
* **PreProcessor.py**
1. **Purpose**: This script is responsible for reading in .log files and converting them to a runtime data structure known as a Pandas Data Frame. Some data cleaning is also performed by this script. The output is a dictionary data structure containing ArbID runtime objects based on the class defined in **ArbID.py**. **J1979.py** is called to attempt to identify and extract data in the Data Frame related to the SAE J1979 standard. J1979 is a public communications standard so this data does not need to be specially analyzed by the following scripts.
* **LexicalAnalysis.py**
1. **Purpose**: This script is responsible for making an educated guess about the time series data present in the Data Frame and ArbID dictionary created by **PreProcessor.py**. Individual time series are recorded using a dictionary of Signal runtime objects based on the class defined in **Signal.py**.
* **SemanticAnalysis.py**
1. **Purpose**: This script generates a correlation matrix of Signal time series produced by **LexicalAnalysis.py**. That correlation matrix is then used to cluster Signal time series using an open source implementation of a Hierarchical Clustering algorithm. A minimal sketch of this step follows the output list below.
* **Plotter.py**
1. **Purpose**: This script uses an open source plotting library to produce visualizations of the groups of Signal time series and J1979 time series produced by the previous scripts.
**Output**: This series of scripts produces an array of output depending on the global variables defined in **Main.py**. This output may include the following:
* Pickle files of the runtime dictionary and Data Frame objects using the open source Pickle library for Python. These files simply speed up repeated execution of the Python scripts when the same .log file is used for input to **Main.py**.
* Comma separated value (.csv) plain text files of the correlation matrix between time series data present in the .log file.
* Graphics of scatter-plots of the time series present in the .log file.
* A graphic of the dendrogram produced during Hierarchical Clustering in **SemanticAnalysis.py**. A dendrogram is a well-documented method for visualizing the results of Hierarchical Clustering algorithms.
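As referenced in the **SemanticAnalysis.py** item above, here is a minimal sketch of the correlate-then-cluster step, modeled on the signal_clustering() code shown in the **Pipeline_multi-file** diff; `signal_df` (one normalized signal per column) and the wrapper function are assumptions.

```python
# Sketch only: build a Pearson correlation matrix, convert it to a distance
# matrix, and cut a single-linkage hierarchy at a distance threshold.
from pandas import DataFrame
from scipy.cluster.hierarchy import fcluster, linkage
import scipy.spatial.distance as ssd


def cluster_signals(signal_df: DataFrame, threshold: float = 0.20):
    corr = signal_df.corr()                 # Pearson correlation matrix
    corr = corr.where(corr > 0, 0)          # ignore negative correlations
    dist = 1 - corr                         # turn similarity into a distance
    condensed = ssd.squareform(dist.values, checks=False)
    linkage_matrix = linkage(condensed, method='single', optimal_ordering=True)
    labels = fcluster(linkage_matrix, t=threshold, criterion='distance')
    return labels, linkage_matrix
```

The 0.20 default mirrors the max_intra_cluster_distance value visible in the Sample.py diff above; it likely needs tuning per vehicle.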
### Pipeline_multi-file
**Input**: CAN data in the format demonstrated in LoggerProgram0.log.
* **Main.py** and the other identically named scripts from **Pipeline** have been updated to allow the scripts to automatically import and process multiple .log files.
* **FileBoi.py**
1. **Purpose**: This is a series of functions which handle the logistics of searching for and reading in data from multiple .log files.
* **Sample.py**
1. **Purpose**: Much of the functionality present in **Main.py** in **Pipeline** has been moved into this script. This works in conjunction with **FileBoi.py** to handle the logistics of working with multiple .log files.
* **SampleStats.py**
1. **Purpose**: This script produces and records a series of basic statistics about a particular .log file.
* **Validator.py**
1. **Purpose**: This script performs a common machine learning validation technique called a train-test split to quantify the consistency of the output of **LexicalAnalysis.py** and **SemanticAnalysis.py**. This was used in conjunction with **SampleStats.py** to produce quantifiable findings for research papers and the dissertation.
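As a hedged sketch of the kind of split that check implies, the helper below chops a sample's frames into kfold_n contiguous, time-ordered chunks (kfold_n = 5 in the multi-file **Main.py**); the helper itself is illustrative, not the repository's actual API.

```python
# Sketch only: split a CAN log DataFrame into k contiguous folds, preserving
# time order because the downstream analyses operate on time series.
from pandas import DataFrame


def split_into_folds(can_df: DataFrame, kfold_n: int = 5):
    bounds = [round(i * len(can_df) / kfold_n) for i in range(kfold_n + 1)]
    return [can_df.iloc[bounds[i]:bounds[i + 1]] for i in range(kfold_n)]
```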
**Output**: The output of **Pipeline_multi-file** is the same as **Pipeline** but organized according to the file structure used to store the set of .log files used as input. **SampleStats.py** and **Validator.py** also produce some additional statistical metrics regarding each .log file.
### R
**Input**: Plain-text .csv files containing time series data such as those included in this folder.
* **commands_list.txt, commands_list_city.txt, commands_list_home.txt**
1. **Purpose**: This is a list of R commands for the publicly available rEDM package. The intent is to perform analysis of the time series according to the rEDM user guide. Each version is highly similar and customized only to point to a different .csv file for input and .pdf file to visualize the output.
**Output**:
* .Rda files
1. **Purpose**: These are machine readable files for storing R Data Frame objects to disk. All of these files were generated using the operations listed in commands_list.txt, commands_list_city.txt, commands_list_home.txt, and the provided .csv files.
* .pdf files
1. **Purpose**: These are visualizations of the output of the R commands using the provided .csv files.