First Commit. Details to follow; I'm still learning.
parent ca184a56de
commit 3749d4f303
@@ -0,0 +1,2 @@
+# Default ignored files
+/workspace.xml
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/Pipeline_multi-file" isTestSource="false" />
+      <excludeFolder url="file://$MODULE_DIR$/Example_Pipeline_Output" />
+      <excludeFolder url="file://$MODULE_DIR$/Pipeline" />
+      <excludeFolder url="file://$MODULE_DIR$/R" />
+    </content>
+    <orderEntry type="jdk" jdkName="Python 3.7 (CAN_Reverse_Engineering)" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
+  </component>
+</module>
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (CAN_Reverse_Engineering)" project-jdk-type="Python SDK" />
+</project>
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/CAN_Reverse_Engineering_git.iml" filepath="$PROJECT_DIR$/.idea/CAN_Reverse_Engineering_git.iml" />
+    </modules>
+  </component>
+</project>
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
@@ -9,11 +9,21 @@ from SemanticAnalysis import subset_selection, subset_correlation, greedy_signal
 from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster
 from PipelineTimer import PipelineTimer
 
+i = 0
+j = 0
 # File names for the on-disc data input and output.
 # Input:
-can_data_filename: str = 'drive_runway_afit.log'
-# can_data_filename: str = 'loggerProgram0.log'
+#can_data_filename: str = 'drive_runway_afit.log'
+can_data_filename: str = 'loggerProgram0.log'
 
+while i < 51:
+    if i == 50 and j < 50: #i need to optimize this and redesign it
+        j += 1
+        i = 0
+    elif i == 50 and j == 50:
+        i = 51
+    else:
+        i += 1
 # Output:
 output_folder: str = 'output'
 pickle_arb_id_filename: str = 'pickleArbIDs.p'
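As committed, this loop runs to completion before anything below it executes, so the thresholds that later reference i and j are evaluated exactly once, with i = 51 and j = 50; the in-line comment concedes the design needs rework. A sketch of an actual grid sweep, assuming the pipeline body were wrapped in a hypothetical run_pipeline() function:

```python
from itertools import product

# Hypothetical run_pipeline() wraps everything below the threshold
# definitions; each (j, i) pair reruns the pipeline with new thresholds.
for j, i in product(range(51), range(51)):
    run_pipeline(tokenization_bit_distance=i / 100,  # swept 0.00 .. 0.50
                 subset_selection_size=j / 100)      # swept 0.00 .. 0.50
```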
@@ -33,15 +43,15 @@ signal_normalize_strategy: Callable = minmax_scale
 
 # Turn on or off portions of the pipeline and output methods using these flags.
 force_pre_processing: bool = False
-force_j1979_plotting: bool = False
+force_j1979_plotting: bool = True
 
-force_lexical_analysis: bool = False
+force_lexical_analysis: bool = True
 force_arb_id_plotting: bool = True
 
-force_semantic_analysis: bool = False
-force_signal_labeling: bool = False
+force_semantic_analysis: bool = True
+force_signal_labeling: bool = True
 use_j1979_tags_in_plots: bool = True
-force_cluster_plotting: bool = False
+force_cluster_plotting: bool = True
 
 dump_to_pickle: bool = True
 
@@ -51,12 +61,14 @@ z_lookup = {.8: 1.28, .9: 1.645, .95: 1.96, .98: 2.33, .99: 2.58}
 freq_analysis_accuracy = z_lookup[0.9]
 freq_synchronous_threshold = 0.1
 
-# Threshold parameters used during lexical analysis.
-tokenization_bit_distance: float = 0.2
+# Threshold parameters used during lexical analysis. Default is 0.2
+tokenization_bit_distance: float = i/100
 tokenize_padding: bool = True
 
-# Threshold parameters used during semantic analysis
-subset_selection_size: float = 0.25
+
+
+# Threshold parameters used during semantic analysis Default is 0.25 and 0.85
+subset_selection_size: float = j/100
 fuzzy_labeling: bool = True
 min_correlation_threshold: float = 0.85
 
@@ -88,7 +100,7 @@ signal_dictionary = generate_signals(a_timer,
                                      pickle_signal_filename,
                                      signal_normalize_strategy,
                                      force_lexical_analysis)
-plot_signals_by_arb_id(a_timer, id_dictionary, signal_dictionary, force_arb_id_plotting)
+plot_signals_by_arb_id(a_timer, id_dictionary, signal_dictionary, i, force_arb_id_plotting)
 
 # SEMANTIC ANALYSIS #
 print("\n\t\t\t##### BEGINNING SEMANTIC ANALYSIS #####")
@@ -116,7 +128,7 @@ signal_dictionary, j1979_correlations = j1979_signal_labeling(a_timer=a_timer,
                                                               signal_dict=signal_dictionary,
                                                               correlation_threshold=min_correlation_threshold,
                                                               force=force_signal_labeling)
-plot_signals_by_cluster(a_timer, cluster_dict, signal_dictionary, use_j1979_tags_in_plots, force_cluster_plotting)
+plot_signals_by_cluster(a_timer, cluster_dict, signal_dictionary, use_j1979_tags_in_plots, i, force_cluster_plotting)
 
 # DATA STORAGE #
 if dump_to_pickle:
@@ -16,7 +16,10 @@ cluster_folder: str = 'clusters'
 j1979_folder: str = 'j1979'
 
 
-def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, force: bool=False):
+def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, settings: int, force: bool = False):
+    arb_id_folder = 'figures' + str(settings)
+
+
     if path.exists(arb_id_folder):
         if force:
             rmtree(arb_id_folder)
@@ -29,7 +32,7 @@ def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dic
     for k_id, signals in signal_dict.items():
         arb_id = arb_id_dict[k_id]
         if not arb_id.static:
-            print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ")")
+            print(str(settings) + "Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ")")
             a_timer.start_iteration_time()
 
             signals_to_plot = []
@@ -99,7 +102,9 @@ def plot_signals_by_cluster(a_timer: PipelineTimer,
                             cluster_dict: dict,
                             signal_dict: dict,
                             use_j1979_tags: bool,
+                            settings: int,
                             force: bool=False):
+    cluster_folder = 'cluster' + str(settings)
     if path.exists(cluster_folder):
         if force:
             rmtree(cluster_folder)
@@ -44,7 +44,7 @@ class PreProcessor:
                              header=None,
                              names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
                              skiprows=7,
-                             delimiter='\t',
+                             delimiter=' ',
                              converters=convert_dict,
                              index_col=0)
 
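The delimiter change from '\t' to ' ' suggests the captured log files are space-separated rather than tab-separated. A minimal, self-contained sketch of the resulting parse (converters omitted here; convert_dict is defined elsewhere in PreProcessor):

```python
from pandas import read_csv

# Space-delimited logger output: 7 header rows skipped, time used as index.
frame = read_csv('loggerProgram0.log',
                 header=None,
                 names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
                 skiprows=7,
                 delimiter=' ',
                 index_col=0)
```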
@@ -60,6 +60,7 @@ class FileBoi:
                 # Check if this file name matches the expected name for a CAN data sample. If so, create new Sample
                 m = re.match('loggerProgram[\d]+.log', file)
                 if m:
+                    i = 0
                     if not (make, model, year) in sample_dict:
                         sample_dict[(make, model, year)] = []
                     this_sample_index = str(len(sample_dict[(make, model, year)]))
@@ -0,0 +1,130 @@
+from numpy import float64, nditer, uint64, zeros, ndarray, inf
+from pandas import Series, DataFrame
+from os import path, remove
+from pickle import load
+from ArbID import ArbID
+from Signal import Signal
+from PipelineTimer import PipelineTimer
+from typing import List
+from scipy import integrate
+
+
+def transform_signal(a_timer: PipelineTimer,
+                     arb_id_dict: dict,
+                     signal_dict: dict,
+                     transform_pickle_filename: str,
+                     normalize_strategy,
+                     given_arb_id: int,
+                     force=False):
+    if force and path.isfile(transform_pickle_filename):
+        remove(transform_pickle_filename)
+    if path.isfile(transform_pickle_filename):
+        print("\nSignal transformation already completed and forcing is turned off. Using pickled data...")
+        return load(open(transform_pickle_filename, "rb"))
+
+    a_timer.start_function_time()
+
+    transform_dict = signal_dict
+
+    # arb_id_dict[given_arb_id * 256] = ArbID(given_arb_id * 256)
+
+    for k, arb_id in arb_id_dict.items():
+        # print(str(arb_id.id) + " == " + str(given_arb_id) + " ?\n")
+        if arb_id.id == given_arb_id:
+            arb_id.static = False
+            arb_id.short = False
+            if not arb_id.static:
+                for token in arb_id.tokenization:
+                    a_timer.start_iteration_time()
+
+                    signal = Signal(k * 256, token[0], token[1])
+                    signal.static = False
+
+
+
+                    # Convert the binary ndarray to a list of string representations of each row
+                    temp1 = [''.join(str(x) for x in row) for row in arb_id.boolean_matrix[:, token[0]:token[1] + 1]]
+                    temp2 = zeros((temp1.__len__()+1), dtype=uint64)
+                    # convert each string representation to int
+                    for i, row in enumerate(temp1):
+                        temp2[i] = int(row, 2)
+
+                    temp3 = integrate.cumtrapz(temp2)
+                    print("Arb Id " + str(k) + ", Signal from " + str(token[0]) + " to " + str(token[1]) + " Integrated successfully")
+
+
+
+                    # create an unsigned integer pandas.Series using the time index from this Arb ID's original data.
+                    signal.time_series = Series(temp3[:], index=arb_id.original_data.index, dtype=float64)
+
+
+
+                    # Normalize the signal and update its meta-data
+                    signal.normalize_and_set_metadata(normalize_strategy)
+                    # add this signal to the signal dictionary which is keyed by Arbitration ID
+                    if (k * 256) in transform_dict:
+                        transform_dict[k * 256][(arb_id.id * 256, signal.start_index, signal.stop_index)] = signal
+                    else:
+                        print("Successfully added at transform dict")
+                        transform_dict[k * 256] = {(arb_id.id * 256, signal.start_index, signal.stop_index): signal}
+
+                    a_timer.set_token_to_signal()
+
+    a_timer.set_signal_generation()
+
+    return transform_dict
+
+
+def transform_signals(a_timer: PipelineTimer,
+                      arb_id_dict: dict,
+                      transform_pickle_filename: str,
+                      normalize_strategy,
+                      force=False):
+    if force and path.isfile(transform_pickle_filename):
+        remove(transform_pickle_filename)
+    if path.isfile(transform_pickle_filename):
+        print("\nSignal transformation already completed and forcing is turned off. Using pickled data...")
+        return load(open(transform_pickle_filename, "rb"))
+
+    a_timer.start_function_time()
+
+    transform_dict = {}  # arb_id_dict
+
+    for k, arb_id in arb_id_dict.items():
+        if not arb_id.static:
+            for token in arb_id.tokenization:
+                a_timer.start_iteration_time()
+
+                signal = Signal(k * 256, token[0], token[1])
+
+
+
+                # Convert the binary ndarray to a list of string representations of each row
+                temp1 = [''.join(str(x) for x in row) for row in arb_id.boolean_matrix[:, token[0]:token[1] + 1]]
+                temp2 = zeros((temp1.__len__()+1), dtype=uint64)
+                # convert each string representation to int
+                for i, row in enumerate(temp1):
+                    temp2[i] = int(row, 2)
+
+                temp3 = integrate.cumtrapz(temp2)
+
+
+
+                # create an unsigned integer pandas.Series using the time index from this Arb ID's original data.
+                signal.time_series = Series(temp3[:], index=arb_id.original_data.index, dtype=float64)
+
+
+
+                # Normalize the signal and update its meta-data
+                signal.normalize_and_set_metadata(normalize_strategy)
+                # add this signal to the signal dictionary which is keyed by Arbitration ID
+                if k in transform_dict:
+                    transform_dict[k][(arb_id.id, signal.start_index, signal.stop_index)] = signal
+                else:
+                    transform_dict[k] = {(arb_id.id, signal.start_index, signal.stop_index): signal}
+
+                a_timer.set_token_to_signal()
+
+    a_timer.set_signal_generation()
+
+    return transform_dict
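The new module re-derives each tokenized signal as the cumulative trapezoidal integral of its raw token values (e.g., integrating a speed signal approximates distance traveled). The core per-token transform, restated as a minimal sketch (assumptions: boolean_matrix is an N x 64 ndarray of 0/1 values, token is an inclusive (start_bit, stop_bit) pair):

```python
from numpy import uint64, zeros
from scipy import integrate


def integrate_token(boolean_matrix, token):
    # Join each row's bit slice into a string, e.g. [1, 0, 1] -> "101".
    rows = [''.join(str(x) for x in row)
            for row in boolean_matrix[:, token[0]:token[1] + 1]]
    # N + 1 slots: cumtrapz returns len - 1 samples, so the result has
    # exactly N values -- one per CAN frame -- matching the time index.
    values = zeros(len(rows) + 1, dtype=uint64)
    for i, row in enumerate(rows):
        values[i] = int(row, 2)  # parse the bit string as base 2
    return integrate.cumtrapz(values)
```

Also worth noting: transform_signal mutates the signal_dict it is given (transform_dict = signal_dict binds the same object) and stores transformed copies under k * 256 so they cannot collide with untransformed arbitration IDs.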
@@ -5,14 +5,22 @@ from Sample import Sample
 # Cross validation parameters for finding an optimal tokenization inversion distance threshold -- NOT WORKING?
 kfold_n: int = 5
 current_vehicle_number = 0
+known_speed_arb_id = 514
 
 good_boi = FileBoi()
 samples = good_boi.go_fetch(kfold_n)
 for key, sample_list in samples.items(): # type: tuple, list
     for sample in sample_list: # type: Sample
+        print(current_vehicle_number)
+
+        # sample.tang_inversion_bit_dist += (0.01 * current_vehicle_number)
+        # sample.max_inter_cluster_dist += (0.01 * current_vehicle_number)
+        # sample.tang_inversion_bit_dist = round(sample.tang_inversion_bit_dist, 2) # removes floating point errors
+        # sample.max_inter_cluster_dist = round(sample.max_inter_cluster_dist, 2)
+        # print("\n\t##### Settings are " + str(sample.tang_inversion_bit_dist) + " and " + str(
+        #     sample.max_inter_cluster_dist) + " #####")
 
         print("\nData import and Pre-Processing for " + sample.output_vehicle_dir)
-        id_dict, j1979_dict = sample.pre_process()
+        id_dict, j1979_dict = sample.pre_process(known_speed_arb_id)
         if j1979_dict:
             sample.plot_j1979(j1979_dict, vehicle_number=str(current_vehicle_number))
 
@@ -25,14 +33,22 @@ for key, sample_list in samples.items(): # type: tuple, list
         print("\n\t##### BEGINNING LEXICAL ANALYSIS OF " + sample.output_vehicle_dir + " #####")
         sample.tokenize_dictionary(id_dict)
         signal_dict = sample.generate_signals(id_dict, bool(j1979_dict))
-        sample.plot_arb_ids(id_dict, signal_dict, vehicle_number=str(current_vehicle_number))
+        # sample.plot_arb_ids(id_dict, signal_dict, vehicle_number=str(current_vehicle_number))
+
-        # LEXICAL ANALYSIS #
+        # KNOWN SIGNAL ANALYSIS #
+        print("\n\t##### BEGINNING KNOWN SIGNAL ANALYSIS OF " + sample.output_vehicle_dir + " #####")
+        transform_dict= sample.transform_signal(id_dict, signal_dict, known_speed_arb_id)
+        sample.plot_arb_ids(id_dict, transform_dict, vehicle_number=str(current_vehicle_number))
+
+
         # SEMANTIC ANALYSIS #
         print("\n\t##### BEGINNING SEMANTIC ANALYSIS OF " + sample.output_vehicle_dir + " #####")
-        corr_matrix, combined_df = sample.generate_correlation_matrix(signal_dict)
+        corr_matrix, combined_df = sample.generate_correlation_matrix(transform_dict)
         if j1979_dict:
-            signal_dict, j1979_correlation = sample.j1979_labeling(j1979_dict, signal_dict, combined_df)
+            transform_dict, j1979_correlation = sample.j1979_labeling(j1979_dict, transform_dict, combined_df)
         cluster_dict, linkage_matrix = sample.cluster_signals(corr_matrix)
-        sample.plot_clusters(cluster_dict, signal_dict, bool(j1979_dict), vehicle_number=str(current_vehicle_number))
+        # sample.plot_clusters(cluster_dict, signal_dict, bool(j1979_dict), vehicle_number=str(current_vehicle_number))
+        sample.plot_known_signal_cluster(cluster_dict, signal_dict, bool(j1979_dict), known_speed_arb_id, vehicle_number=str(current_vehicle_number))
         sample.plot_dendrogram(linkage_matrix, vehicle_number=str(current_vehicle_number))
         current_vehicle_number += 1
+
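The multi-file pipeline now anchors the analysis to a known speed signal (arb ID 514, i.e. 0x202): it is duplicated and integrated during pre-processing, the transformed dictionary drives correlation and clustering, and only clusters containing the known ID or its * 256 alias get plotted. The membership test used by plot_known_signal_cluster (see the Plotter hunk further down), restated as a standalone predicate with a hypothetical name:

```python
def cluster_contains_known_id(list_of_signals, given_arb_id):
    # Signal keys are tuples whose first element is the arb ID; the
    # integrated copy of a known ID k is stored under the alias k * 256.
    return any(v[0] == given_arb_id or v[0] == given_arb_id * 256
               for v in list_of_signals)
```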
@@ -25,13 +25,13 @@ def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dic
             rmtree(arb_id_folder)
         else:
             print("\nArbID plotting appears to have already been done and forcing is turned off. Skipping...")
-            return
+            # return
 
     a_timer.start_function_time()
 
     for k_id, signals in signal_dict.items():
         arb_id = arb_id_dict[k_id]
-        if not arb_id.static and not arb_id.short:
+        if (not arb_id.static and not arb_id.short) or k_id == 155136:
             print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ") for Vehicle " + vehicle_number)
             a_timer.start_iteration_time()
 
@@ -85,7 +85,7 @@ def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dic
         chdir(arb_id_folder)
 
         # If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
-        savefig(hex(arb_id.id) + "." + figure_format,
+        savefig(hex(signal.arb_id) + "." + figure_format,
                 bbox_iches='tight',
                 pad_inches=0.0,
                 dpi=figure_dpi,
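One caveat in this savefig call (pre-existing, not introduced here): bbox_iches is presumably a typo for matplotlib's bbox_inches keyword, so the tight bounding box is most likely never applied. The recognized spelling, in a self-contained example:

```python
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], color='black')
# bbox_inches (not "bbox_iches") is the keyword matplotlib recognizes.
fig.savefig('example.png', bbox_inches='tight', pad_inches=0.0, dpi=300)
```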
@@ -311,3 +311,162 @@ def plot_dendrogram(a_timer: PipelineTimer,
             transparent=figure_transp)
     plt.close()
     print("\t\tComplete...")
+
+
+def plot_known_signal_cluster(a_timer: PipelineTimer,
+                              cluster_dict: dict,
+                              signal_dict: dict,
+                              use_j1979_tags: bool,
+                              vehicle_number: str,
+                              given_arb_id: int,
+                              force: bool = False):
+    if path.exists(cluster_folder):
+        if force:
+            rmtree(cluster_folder)
+        else:
+            print("\nCluster plotting appears to have already been done and forcing is turned off. Skipping...")
+            return
+
+    a_timer.start_function_time()
+
+    print("\n")
+    for cluster_number, list_of_signals in cluster_dict.items():
+        if [v for i, v in enumerate(list_of_signals) if (v[0] == given_arb_id or v[0] == given_arb_id * 256)]:
+            print("Plotting cluster", cluster_number, "with " + str(len(list_of_signals)) + " signals.")
+            a_timer.start_iteration_time()
+
+            # Setup the plot
+            fig, axes = plt.subplots(nrows=len(list_of_signals), ncols=1, squeeze=False)
+            plt.suptitle("Signal Cluster " + str(cluster_number) + " from Vehicle " + vehicle_number,
+                         weight='bold',
+                         position=(0.5, 1))
+            fig.set_size_inches(8, (1 + len(list_of_signals)+1) * 1.3)
+
+            size_adjust = len(list_of_signals) / 100
+            # The min() statement provides whitespace for the suptitle depending on the number of subplots.
+            plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
+            # This adjusts whitespace padding on the left and right of the subplots
+            fig.subplots_adjust(left=0.07, right=0.98)
+
+            # Plot the time series of each signal in the cluster
+            for i, signal_key in enumerate(list_of_signals):
+                signal = signal_dict[signal_key[0]][signal_key]
+                ax = axes[i, 0]
+                if signal.j1979_title and use_j1979_tags:
+                    this_title = signal.plot_title + " [" + signal.j1979_title + \
+                                 " (PCC:" + str(round(signal.j1979_pcc, 2)) + ")]"
+                else:
+                    this_title = signal.plot_title
+                ax.set_title(this_title,
+                             style='italic',
+                             size='medium')
+                ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
+                ax.plot(signal.time_series, color='black')
+
+            if not path.exists(cluster_folder):
+                mkdir(cluster_folder)
+            chdir(cluster_folder)
+
+            # If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
+            if len(list_of_signals) < 100: # prevents errors when given too low a setting for correlation
+                savefig("cluster_" + str(cluster_number) + "." + figure_format,
+                        bbox_iches='tight',
+                        pad_inches=0.0,
+                        dpi=figure_dpi,
+                        format=figure_format,
+                        transparent=figure_transp)
+            else:
+                print("Too many clusters to plot! Skipping...")
+
+            chdir("..")
+
+            plt.close(fig)
+
+            a_timer.set_plot_save_cluster()
+            print("\tComplete...")
+
+    a_timer.set_plot_save_cluster_dict()
+
+
+def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, vehicle_number: str,
+                           force: bool=False):
+    if path.exists(arb_id_folder):
+        if force:
+            rmtree(arb_id_folder)
+        else:
+            print("\nArbID plotting appears to have already been done and forcing is turned off. Skipping...")
+            # return
+
+    a_timer.start_function_time()
+
+    for k_id, signals in signal_dict.items():
+        arb_id = arb_id_dict[k_id]
+        if (not arb_id.static and not arb_id.short) or k_id == 155136:
+            print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ") for Vehicle " + vehicle_number)
+            a_timer.start_iteration_time()
+
+            signals_to_plot = []
+            # Don't plot the static signals
+            for k_signal, signal in signals.items():
+                if not signal.static:
+                    signals_to_plot.append(signal)
+            # There's a corner case where the Arb ID only has static signals. This conditional accounts for this.
+            # TODO: This corner case should probably be reflected by arb_id.static.
+            if len(signals_to_plot) < 1:
+                continue
+            # One row per signal plus one for the TANG. Squeeze is used to force axes to be an array to avoid errors.
+            fig, axes = plt.subplots(nrows=1 + len(signals_to_plot), ncols=1)
+            plt.suptitle("Time Series and TANG for Arbitration ID " + hex(k_id) + " from Vehicle " + vehicle_number,
+                         weight='bold',
+                         position=(0.5, 1))
+            fig.set_size_inches(8, (1 + len(signals_to_plot) + 1) * 1.3)
+            # The min() statement provides whitespace for the title depending on the number of subplots.
+            size_adjust = len(signals_to_plot) / 100
+            plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
+            # This adjusts whitespace padding on the left and right of the subplots
+            fig.subplots_adjust(left=0.07, right=0.98)
+            for i, signal in enumerate(signals_to_plot):
+                ax = axes[i]
+                ax.set_title(signal.plot_title,
+                             style='italic',
+                             size='medium')
+                ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
+                ax.plot(signal.time_series, color='black')
+                # Add a 25% opacity dashed black line to the entropy gradient plot at one boundary of each sub-flow
+                axes[-1].axvline(x=signal.start_index, alpha=0.25, c='black', linestyle='dashed')
+
+            # Plot the entropy gradient at the bottom of the overall output
+            ax = axes[-1]
+            ax.set_title("Min-Max Normalized Transition Aggregation N-Gram (TANG)",
+                         style='italic',
+                         size='medium')
+            tang_bit_width = arb_id.tang.shape[0]
+            ax.set_xlim([-0.01 * tang_bit_width, 1.005 * tang_bit_width])
+            y = arb_id.tang[:]
+            # Differentiate bit positions with non-zero and zero entropy using black points and grey x respectively.
+            ix = isin(y, 0)
+            pad_bit = where(ix)
+            non_pad_bit = where(~ix)
+            ax.scatter(non_pad_bit, y[non_pad_bit], color='black', marker='o', s=10)
+            ax.scatter(pad_bit, y[pad_bit], color='grey', marker='^', s=10)
+
+            if not path.exists(arb_id_folder):
+                mkdir(arb_id_folder)
+            chdir(arb_id_folder)
+
+            # If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
+            savefig(hex(signal.arb_id) + "." + figure_format,
+                    bbox_iches='tight',
+                    pad_inches=0.0,
+                    dpi=figure_dpi,
+                    format=figure_format,
+                    transparent=figure_transp)
+
+            chdir("..")
+
+            plt.close(fig)
+
+            a_timer.set_plot_save_arb_id()
+            print("\tComplete...")
+
+    a_timer.set_plot_save_arb_id_dict()
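Note that this hunk appends a second def plot_signals_by_arb_id to a module that already defines one (modified above at lines 25 and 85). Python keeps whichever module-level definition executes last, so the appended TANG-plotting variant silently shadows the earlier one for every caller:

```python
# Later module-level definitions win at import time:
def plot(): return "first"
def plot(): return "second"  # rebinding shadows the one above
assert plot() == "second"
```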
@@ -1,4 +1,4 @@
-from pandas import DataFrame, read_csv, Series
+from pandas import DataFrame, read_csv, Series, concat
 from numpy import int64
 from os import path, remove, getcwd
 from pickle import load
@@ -45,7 +45,7 @@ class PreProcessor:
                              header=None,
                              names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
                              skiprows=7,
-                             delimiter='\t',
+                             delimiter=' ',
                              converters=convert_dict,
                              index_col=0)
 
@@ -70,6 +70,7 @@ class PreProcessor:
                                    time_conversion: int = 1000,
                                    freq_analysis_accuracy: float = 0.0,
                                    freq_synchronous_threshold: float = 0.0,
+                                   given_arb_id: int = 0,
                                    force: bool = False) -> (dict, dict):
         id_dictionary = {}
         j1979_dictionary = {}
@@ -92,6 +93,11 @@ class PreProcessor:
             return id_dictionary, j1979_dictionary
         else:
             self.import_csv(a_timer, self.data_filename)
+            this_id = self.data.loc[self.data['id'] == given_arb_id].copy()
+            this_id.id = given_arb_id * 256
+
+            combined = concat([self.data, this_id])
+            self.data = combined
 
         a_timer.start_function_time()
 
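This is where the known arb ID gets its alias: all frames whose 'id' equals given_arb_id are copied, relabeled as given_arb_id * 256, and appended, so the copy flows through the rest of the pipeline as an extra pseudo arbitration ID. An equivalent standalone sketch (assuming a DataFrame with an 'id' column; attribute assignment to an existing column, as in the committed this_id.id = ..., has the same effect):

```python
from pandas import DataFrame, concat

data = DataFrame({'id': [514, 1042, 514], 'b0': [1, 2, 3]})
known = 514
alias = data.loc[data['id'] == known].copy()
alias['id'] = known * 256      # 131584: relabel the copy as a pseudo ID
data = concat([data, alias])   # original frames plus the aliased copy
```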
@@ -2,7 +2,7 @@ from PreProcessor import PreProcessor
 from Validator import Validator
 from LexicalAnalysis import tokenize_dictionary, generate_signals
 from SemanticAnalysis import generate_correlation_matrix, signal_clustering, j1979_signal_labeling
-from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster, plot_dendrogram
+from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster, plot_dendrogram, plot_known_signal_cluster
 from sklearn.preprocessing import minmax_scale
 from typing import Callable
 from PipelineTimer import PipelineTimer
@@ -11,6 +11,8 @@ from pickle import dump, load
 from numpy import ndarray, zeros, float16
 from pandas import DataFrame
+
+from KnownSignalAnalysis import transform_signals, transform_signal
 
 # File names for the on-disc data input and output.
 output_folder: str = 'output'
 pickle_arb_id_filename: str = 'pickleArbIDs.p'
@@ -26,6 +28,8 @@ pickle_combined_df_filename: str = 'pickleCombinedDataFrame.p'
 csv_all_signals_filename: str = 'complete_correlation_matrix.csv'
 pickle_timer_filename: str = 'pickleTimer.p'
+
+pickle_transform_filename: str = 'pickleTransform'
 
 dump_to_pickle: bool = True
 
 # Change out the normalization strategies as needed.
@@ -39,9 +43,11 @@ force_threshold_plotting: bool = False
 force_j1979_plotting: bool = True
 use_j1979: bool = True
+
+force_transform: bool = False
 
 force_lexical_analysis: bool = False
 force_signal_generation: bool = False
-force_arb_id_plotting: bool = True
+force_arb_id_plotting: bool = False
 
 force_correlation_matrix: bool = False
 force_clustering: bool = False
@@ -58,16 +64,15 @@ freq_synchronous_threshold = 0.1
 
 # Threshold parameters used during lexical analysis.
 tokenization_bit_distance: float = 0.2
-tokenize_padding: bool = True
+tokenize_padding: bool = False # changing this to false seems to help better find weak signals
 merge_tokens: bool = True
 
 # Threshold parameters used during semantic analysis
 subset_selection_size: float = 0.25
-max_intra_cluster_distance: float = 0.20
+max_intra_cluster_distance: float = 0.10 # normally 0.25
 min_j1979_correlation: float = 0.85
 # fuzzy_labeling: bool = True
-
 
 # A timer class to record timings throughout the pipeline.
 a_timer = PipelineTimer(verbose=True)
 
@@ -112,7 +117,7 @@ class Sample:
         # Move back to root of './output/make_model_year/sample_index/"
         chdir("../../../")
 
-    def pre_process(self):
+    def pre_process(self, given_arb_id):
         self.make_and_move_to_vehicle_directory()
         pre_processor = PreProcessor(self.path, pickle_arb_id_filename, pickle_j1979_filename, self.use_j1979)
         id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
@@ -120,6 +125,7 @@ class Sample:
                                                                                     time_conversion,
                                                                                     freq_analysis_accuracy,
                                                                                     freq_synchronous_threshold,
+                                                                                    given_arb_id,
                                                                                     force_pre_processing)
         if dump_to_pickle:
             if force_pre_processing:
@@ -303,3 +309,37 @@ class Sample:
         plot_dendrogram(a_timer=a_timer, linkage_matrix=linkage_matrix, threshold=self.max_inter_cluster_dist,
                         vehicle_number=vehicle_number, force=force_dendrogram_plotting)
         self.move_back_to_parent_directory()
+
+    def transform_signals(self, id_dictionary: dict):
+        self.make_and_move_to_vehicle_directory()
+        transform_dict = transform_signals(a_timer=a_timer,
+                                           arb_id_dict=id_dictionary,
+                                           transform_pickle_filename=pickle_transform_filename,
+                                           normalize_strategy=signal_normalize_strategy,
+                                           force=force_transform)
+        self.move_back_to_parent_directory()
+        return transform_dict
+
+    def transform_signal(self, id_dictionary: dict, signal_dict: dict, arb_id: int):
+        self.make_and_move_to_vehicle_directory()
+        transform_dict = transform_signal(a_timer=a_timer,
+                                          arb_id_dict=id_dictionary,
+                                          signal_dict=signal_dict,
+                                          transform_pickle_filename=pickle_transform_filename,
+                                          normalize_strategy=signal_normalize_strategy,
+                                          given_arb_id=arb_id,
+                                          force=force_transform)
+        self.move_back_to_parent_directory()
+        return transform_dict
+
+    def plot_known_signal_cluster(self, cluster_dictionary: dict, signal_dictionary: dict, use_j1979_tags: bool,
+                                  known_signal: int, vehicle_number: str):
+        self.make_and_move_to_vehicle_directory()
+        plot_known_signal_cluster(a_timer=a_timer,
+                                  cluster_dict=cluster_dictionary,
+                                  signal_dict=signal_dictionary,
+                                  use_j1979_tags=use_j1979_tags,
+                                  vehicle_number=vehicle_number,
+                                  given_arb_id=known_signal,
+                                  force=force_cluster_plotting)
+        self.move_back_to_parent_directory()
@@ -1,5 +1,5 @@
 from pandas import concat, DataFrame, read_csv
-from numpy import ndarray, zeros
+from numpy import ndarray, zeros, clip
 from os import path, remove
 from pickle import load, dump
 from ast import literal_eval
@@ -77,7 +77,7 @@ def signal_clustering(corr_matrix: DataFrame,
     corr_matrix.where(corr_matrix > 0, 0, inplace=True)
     corr_matrix = 1 - corr_matrix
     X = corr_matrix.values  # type: ndarray
-    Y = ssd.squareform(X)
+    Y = clip(ssd.squareform(X), 0, None)
     # Z is the linkage matrix. This can serve as input to the scipy.cluster.hierarchy.dendrogram method
     Z = linkage(Y, method='single', optimal_ordering=True)
    fclus = fcluster(Z, t=threshold, criterion='distance')
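The clip guards linkage against floating-point noise: after corr_matrix = 1 - corr_matrix, a correlation that computed a hair above 1.0 becomes a tiny negative distance, which scipy's linkage rejects in a condensed distance matrix. A demonstration of the failure mode and the fix:

```python
import numpy as np
import scipy.spatial.distance as ssd
from scipy.cluster.hierarchy import linkage

corr = np.array([[1.0, 1.0 + 1e-15],
                 [1.0 + 1e-15, 1.0]])       # PCC slightly above 1 from rounding
dist = 1 - corr                             # off-diagonals come out ~ -1e-15
Y = np.clip(ssd.squareform(dist), 0, None)  # floor the noise at exactly 0
Z = linkage(Y, method='single', optimal_ordering=True)
```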