First Commit. Details to follow; I'm still learning.

This commit is contained in:
JoshuaArking 2019-08-27 13:35:21 -04:00 committed by JoshuaArking
parent ca184a56de
commit 3749d4f303
29 changed files with 608 additions and 197 deletions

2
.idea/.gitignore vendored Normal file

@ -0,0 +1,2 @@
# Default ignored files
/workspace.xml

16
.idea/CAN_Reverse_Engineering_git.iml Normal file

@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/Pipeline_multi-file" isTestSource="false" />
<excludeFolder url="file://$MODULE_DIR$/Example_Pipeline_Output" />
<excludeFolder url="file://$MODULE_DIR$/Pipeline" />
<excludeFolder url="file://$MODULE_DIR$/R" />
</content>
<orderEntry type="jdk" jdkName="Python 3.7 (CAN_Reverse_Engineering)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>

6
.idea/inspectionProfiles/profiles_settings.xml Normal file

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
.idea/misc.xml Normal file

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (CAN_Reverse_Engineering)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/CAN_Reverse_Engineering_git.iml" filepath="$PROJECT_DIR$/.idea/CAN_Reverse_Engineering_git.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml Normal file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

0
Pipeline/ArbID.py Normal file → Executable file

0
Pipeline/J1979.py Normal file → Executable file

0
Pipeline/LexicalAnalysis.py Normal file → Executable file

356
Pipeline/Main.py Normal file → Executable file

@ -9,190 +9,202 @@ from SemanticAnalysis import subset_selection, subset_correlation, greedy_signal
from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster
from PipelineTimer import PipelineTimer
i = 0
j = 0
# File names for the on-disc data input and output.
# Input:
can_data_filename: str = 'drive_runway_afit.log'
# can_data_filename: str = 'loggerProgram0.log'
#can_data_filename: str = 'drive_runway_afit.log'
can_data_filename: str = 'loggerProgram0.log'
# Output:
output_folder: str = 'output'
pickle_arb_id_filename: str = 'pickleArbIDs.p'
pickle_j1979_filename: str = 'pickleJ1979.p'
pickle_signal_filename: str = 'pickleSignals.p'
pickle_subset_filename: str = 'pickleSubset.p'
csv_correlation_filename: str = 'subset_correlation_matrix.csv'
pickle_j1979_correlation: str = 'pickleJ1979_correlation.p'
pickle_clusters_filename: str = 'pickleClusters.p'
pickle_all_signal_filename: str = 'pickleAllSignalsDataFrame.p'
csv_all_signals_filename: str = 'complete_correlation_matrix.csv'
pickle_timer_filename: str = 'pickleTimer.p'
while i < 51:
if i == 50 and j < 50:  # TODO: optimize and redesign this sweep
j += 1
i = 0
elif i == 50 and j == 50:
i = 51
else:
i += 1
# Output:
output_folder: str = 'output'
pickle_arb_id_filename: str = 'pickleArbIDs.p'
pickle_j1979_filename: str = 'pickleJ1979.p'
pickle_signal_filename: str = 'pickleSignals.p'
pickle_subset_filename: str = 'pickleSubset.p'
csv_correlation_filename: str = 'subset_correlation_matrix.csv'
pickle_j1979_correlation: str = 'pickleJ1979_correlation.p'
pickle_clusters_filename: str = 'pickleClusters.p'
pickle_all_signal_filename: str = 'pickleAllSignalsDataFrame.p'
csv_all_signals_filename: str = 'complete_correlation_matrix.csv'
pickle_timer_filename: str = 'pickleTimer.p'
# Change out the normalization strategies as needed.
tang_normalize_strategy: Callable = minmax_scale
signal_normalize_strategy: Callable = minmax_scale
# Change out the normalization strategies as needed.
tang_normalize_strategy: Callable = minmax_scale
signal_normalize_strategy: Callable = minmax_scale
# Turn on or off portions of the pipeline and output methods using these flags.
force_pre_processing: bool = False
force_j1979_plotting: bool = False
# Turn on or off portions of the pipeline and output methods using these flags.
force_pre_processing: bool = False
force_j1979_plotting: bool = True
force_lexical_analysis: bool = False
force_arb_id_plotting: bool = True
force_lexical_analysis: bool = True
force_arb_id_plotting: bool = True
force_semantic_analysis: bool = False
force_signal_labeling: bool = False
use_j1979_tags_in_plots: bool = True
force_cluster_plotting: bool = False
force_semantic_analysis: bool = True
force_signal_labeling: bool = True
use_j1979_tags_in_plots: bool = True
force_cluster_plotting: bool = True
dump_to_pickle: bool = True
dump_to_pickle: bool = True
# Parameters and threshold used for Arb ID transmission frequency analysis during Pre-processing.
time_conversion = 1000 # convert seconds to milliseconds
z_lookup = {.8: 1.28, .9: 1.645, .95: 1.96, .98: 2.33, .99: 2.58}
freq_analysis_accuracy = z_lookup[0.9]
freq_synchronous_threshold = 0.1
# Parameters and threshold used for Arb ID transmission frequency analysis during Pre-processing.
time_conversion = 1000 # convert seconds to milliseconds
z_lookup = {.8: 1.28, .9: 1.645, .95: 1.96, .98: 2.33, .99: 2.58}
freq_analysis_accuracy = z_lookup[0.9]
freq_synchronous_threshold = 0.1
# Threshold parameters used during lexical analysis.
tokenization_bit_distance: float = 0.2
tokenize_padding: bool = True
# Threshold parameters used during semantic analysis
subset_selection_size: float = 0.25
fuzzy_labeling: bool = True
min_correlation_threshold: float = 0.85
# A timer class to record timings throughout the pipeline.
a_timer = PipelineTimer(verbose=True)
# DATA IMPORT AND PRE-PROCESSING #
pre_processor = PreProcessor(can_data_filename, pickle_arb_id_filename, pickle_j1979_filename)
id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
tang_normalize_strategy,
time_conversion,
freq_analysis_accuracy,
freq_synchronous_threshold,
force_pre_processing)
if j1979_dictionary:
plot_j1979(a_timer, j1979_dictionary, force_j1979_plotting)
# Threshold parameters used during lexical analysis. Default is 0.2
tokenization_bit_distance: float = i/100
tokenize_padding: bool = True
# LEXICAL ANALYSIS #
print("\n\t\t\t##### BEGINNING LEXICAL ANALYSIS #####")
tokenize_dictionary(a_timer,
id_dictionary,
force_lexical_analysis,
include_padding=tokenize_padding,
merge=True,
max_distance=tokenization_bit_distance)
signal_dictionary = generate_signals(a_timer,
id_dictionary,
pickle_signal_filename,
signal_normalize_strategy,
force_lexical_analysis)
plot_signals_by_arb_id(a_timer, id_dictionary, signal_dictionary, force_arb_id_plotting)
# SEMANTIC ANALYSIS #
print("\n\t\t\t##### BEGINNING SEMANTIC ANALYSIS #####")
subset_df = subset_selection(a_timer,
signal_dictionary,
pickle_subset_filename,
force_semantic_analysis,
subset_size=subset_selection_size)
corr_matrix_subset = subset_correlation(subset_df, csv_correlation_filename, force_semantic_analysis)
cluster_dict = greedy_signal_clustering(corr_matrix_subset,
correlation_threshold=min_correlation_threshold,
fuzzy_labeling=fuzzy_labeling)
df_full, corr_matrix_full, cluster_dict = label_propagation(a_timer,
pickle_clusters_filename=pickle_clusters_filename,
pickle_all_signals_df_filename=pickle_all_signal_filename,
csv_signals_correlation_filename=csv_all_signals_filename,
signal_dict=signal_dictionary,
cluster_dict=cluster_dict,
correlation_threshold=min_correlation_threshold,
force=force_semantic_analysis)
signal_dictionary, j1979_correlations = j1979_signal_labeling(a_timer=a_timer,
j1979_corr_filename=pickle_j1979_correlation,
df_signals=df_full,
j1979_dict=j1979_dictionary,
signal_dict=signal_dictionary,
correlation_threshold=min_correlation_threshold,
force=force_signal_labeling)
plot_signals_by_cluster(a_timer, cluster_dict, signal_dictionary, use_j1979_tags_in_plots, force_cluster_plotting)
# Threshold parameters used during semantic analysis Default is 0.25 and 0.85
subset_selection_size: float = j/100
fuzzy_labeling: bool = True
min_correlation_threshold: float = 0.85
# DATA STORAGE #
if dump_to_pickle:
if force_pre_processing:
if path.isfile(pickle_arb_id_filename):
remove(pickle_arb_id_filename)
if path.isfile(pickle_j1979_filename):
remove(pickle_j1979_filename)
if force_lexical_analysis or force_signal_labeling:
if path.isfile(pickle_signal_filename):
remove(pickle_signal_filename)
if force_semantic_analysis:
if path.isfile(pickle_subset_filename):
remove(pickle_subset_filename)
if path.isfile(csv_correlation_filename):
remove(csv_correlation_filename)
if path.isfile(pickle_j1979_correlation):
remove(pickle_j1979_correlation)
if path.isfile(pickle_clusters_filename):
remove(pickle_clusters_filename)
if path.isfile(pickle_all_signal_filename):
remove(pickle_all_signal_filename)
if path.isfile(csv_all_signals_filename):
remove(csv_all_signals_filename)
# A timer class to record timings throughout the pipeline.
a_timer = PipelineTimer(verbose=True)
timer_flag = 0
if not path.exists(output_folder):
mkdir(output_folder)
chdir(output_folder)
if not path.isfile(pickle_arb_id_filename):
timer_flag += 1
print("\nDumping arb ID dictionary to " + pickle_arb_id_filename)
dump(id_dictionary, open(pickle_arb_id_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_j1979_filename):
timer_flag += 1
print("\nDumping J1979 dictionary to " + pickle_j1979_filename)
dump(j1979_dictionary, open(pickle_j1979_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_signal_filename):
timer_flag += 1
print("\nDumping signal dictionary to " + pickle_signal_filename)
dump(signal_dictionary, open(pickle_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_subset_filename):
timer_flag += 1
print("\nDumping signal subset list to " + pickle_subset_filename)
dump(subset_df, open(pickle_subset_filename, "wb"))
print("\tComplete...")
if not path.isfile(csv_correlation_filename):
timer_flag += 1
print("\nDumping subset correlation matrix to " + csv_correlation_filename)
corr_matrix_subset.to_csv(csv_correlation_filename)
print("\tComplete...")
if not path.isfile(pickle_j1979_correlation):
timer_flag += 1
print("\nDumping J1979 correlation DataFrame to " + pickle_j1979_correlation)
dump(j1979_correlations, open(pickle_j1979_correlation, "wb"))
print("\tComplete...")
if not path.isfile(pickle_clusters_filename):
timer_flag += 1
print("\nDumping cluster dictionary to " + pickle_clusters_filename)
dump(cluster_dict, open(pickle_clusters_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_all_signal_filename):
timer_flag += 1
print("\nDumping complete signals DataFrame to " + pickle_all_signal_filename)
dump(df_full, open(pickle_all_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(csv_all_signals_filename):
timer_flag += 1
print("\nDumping complete correlation matrix to " + csv_all_signals_filename)
corr_matrix_full.to_csv(csv_all_signals_filename)
print("\tComplete...")
if timer_flag == 9:
print("\nDumping pipeline timer to " + pickle_timer_filename)
dump(a_timer, open(pickle_timer_filename, "wb"))
print("\tComplete...")
chdir("..")
# DATA IMPORT AND PRE-PROCESSING #
pre_processor = PreProcessor(can_data_filename, pickle_arb_id_filename, pickle_j1979_filename)
id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
tang_normalize_strategy,
time_conversion,
freq_analysis_accuracy,
freq_synchronous_threshold,
force_pre_processing)
if j1979_dictionary:
plot_j1979(a_timer, j1979_dictionary, force_j1979_plotting)
# LEXICAL ANALYSIS #
print("\n\t\t\t##### BEGINNING LEXICAL ANALYSIS #####")
tokenize_dictionary(a_timer,
id_dictionary,
force_lexical_analysis,
include_padding=tokenize_padding,
merge=True,
max_distance=tokenization_bit_distance)
signal_dictionary = generate_signals(a_timer,
id_dictionary,
pickle_signal_filename,
signal_normalize_strategy,
force_lexical_analysis)
plot_signals_by_arb_id(a_timer, id_dictionary, signal_dictionary, i, force_arb_id_plotting)
# SEMANTIC ANALYSIS #
print("\n\t\t\t##### BEGINNING SEMANTIC ANALYSIS #####")
subset_df = subset_selection(a_timer,
signal_dictionary,
pickle_subset_filename,
force_semantic_analysis,
subset_size=subset_selection_size)
corr_matrix_subset = subset_correlation(subset_df, csv_correlation_filename, force_semantic_analysis)
cluster_dict = greedy_signal_clustering(corr_matrix_subset,
correlation_threshold=min_correlation_threshold,
fuzzy_labeling=fuzzy_labeling)
df_full, corr_matrix_full, cluster_dict = label_propagation(a_timer,
pickle_clusters_filename=pickle_clusters_filename,
pickle_all_signals_df_filename=pickle_all_signal_filename,
csv_signals_correlation_filename=csv_all_signals_filename,
signal_dict=signal_dictionary,
cluster_dict=cluster_dict,
correlation_threshold=min_correlation_threshold,
force=force_semantic_analysis)
signal_dictionary, j1979_correlations = j1979_signal_labeling(a_timer=a_timer,
j1979_corr_filename=pickle_j1979_correlation,
df_signals=df_full,
j1979_dict=j1979_dictionary,
signal_dict=signal_dictionary,
correlation_threshold=min_correlation_threshold,
force=force_signal_labeling)
plot_signals_by_cluster(a_timer, cluster_dict, signal_dictionary, use_j1979_tags_in_plots, i, force_cluster_plotting)
# DATA STORAGE #
if dump_to_pickle:
if force_pre_processing:
if path.isfile(pickle_arb_id_filename):
remove(pickle_arb_id_filename)
if path.isfile(pickle_j1979_filename):
remove(pickle_j1979_filename)
if force_lexical_analysis or force_signal_labeling:
if path.isfile(pickle_signal_filename):
remove(pickle_signal_filename)
if force_semantic_analysis:
if path.isfile(pickle_subset_filename):
remove(pickle_subset_filename)
if path.isfile(csv_correlation_filename):
remove(csv_correlation_filename)
if path.isfile(pickle_j1979_correlation):
remove(pickle_j1979_correlation)
if path.isfile(pickle_clusters_filename):
remove(pickle_clusters_filename)
if path.isfile(pickle_all_signal_filename):
remove(pickle_all_signal_filename)
if path.isfile(csv_all_signals_filename):
remove(csv_all_signals_filename)
timer_flag = 0
if not path.exists(output_folder):
mkdir(output_folder)
chdir(output_folder)
if not path.isfile(pickle_arb_id_filename):
timer_flag += 1
print("\nDumping arb ID dictionary to " + pickle_arb_id_filename)
dump(id_dictionary, open(pickle_arb_id_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_j1979_filename):
timer_flag += 1
print("\nDumping J1979 dictionary to " + pickle_j1979_filename)
dump(j1979_dictionary, open(pickle_j1979_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_signal_filename):
timer_flag += 1
print("\nDumping signal dictionary to " + pickle_signal_filename)
dump(signal_dictionary, open(pickle_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_subset_filename):
timer_flag += 1
print("\nDumping signal subset list to " + pickle_subset_filename)
dump(subset_df, open(pickle_subset_filename, "wb"))
print("\tComplete...")
if not path.isfile(csv_correlation_filename):
timer_flag += 1
print("\nDumping subset correlation matrix to " + csv_correlation_filename)
corr_matrix_subset.to_csv(csv_correlation_filename)
print("\tComplete...")
if not path.isfile(pickle_j1979_correlation):
timer_flag += 1
print("\nDumping J1979 correlation DataFrame to " + pickle_j1979_correlation)
dump(j1979_correlations, open(pickle_j1979_correlation, "wb"))
print("\tComplete...")
if not path.isfile(pickle_clusters_filename):
timer_flag += 1
print("\nDumping cluster dictionary to " + pickle_clusters_filename)
dump(cluster_dict, open(pickle_clusters_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_all_signal_filename):
timer_flag += 1
print("\nDumping complete signals DataFrame to " + pickle_all_signal_filename)
dump(df_full, open(pickle_all_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(csv_all_signals_filename):
timer_flag += 1
print("\nDumping complete correlation matrix to " + csv_all_signals_filename)
corr_matrix_full.to_csv(csv_all_signals_filename)
print("\tComplete...")
if timer_flag == 9:
print("\nDumping pipeline timer to " + pickle_timer_filename)
dump(a_timer, open(pickle_timer_filename, "wb"))
print("\tComplete...")
chdir("..")
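
The rewritten Main.py sweeps tokenization_bit_distance (i/100) and subset_selection_size (j/100) over a 51x51 grid with hand-maintained i/j counters. A minimal sketch of one possible cleanup, not part of this commit: wrap the pipeline stages in a function and drive both thresholds with itertools.product.

from itertools import product

def run_pipeline(tokenization_bit_distance: float, subset_selection_size: float) -> None:
    ...  # pre-processing, lexical analysis, semantic analysis, and data storage as above

for i, j in product(range(51), repeat=2):
    run_pipeline(tokenization_bit_distance=i / 100, subset_selection_size=j / 100)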

0
Pipeline/PipelineTimer.py Normal file → Executable file

9
Pipeline/Plotter.py Normal file → Executable file

@ -16,7 +16,10 @@ cluster_folder: str = 'clusters'
j1979_folder: str = 'j1979'
def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, force: bool=False):
def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, settings: int, force: bool = False):
arb_id_folder = 'figures' + str(settings)
if path.exists(arb_id_folder):
if force:
rmtree(arb_id_folder)
@ -29,7 +32,7 @@ def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dic
for k_id, signals in signal_dict.items():
arb_id = arb_id_dict[k_id]
if not arb_id.static:
print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ")")
print(str(settings) + "Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ")")
a_timer.start_iteration_time()
signals_to_plot = []
@ -99,7 +102,9 @@ def plot_signals_by_cluster(a_timer: PipelineTimer,
cluster_dict: dict,
signal_dict: dict,
use_j1979_tags: bool,
settings: int,
force: bool=False):
cluster_folder = 'cluster' + str(settings)
if path.exists(cluster_folder):
if force:
rmtree(cluster_folder)
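
Both plotting functions now take a settings index so each sweep iteration writes into its own directory ('figures<i>' and 'cluster<i>') instead of overwriting earlier output. A minimal sketch of that naming scheme; the helper name output_dir_for is hypothetical:

from os import path, mkdir

def output_dir_for(base: str, settings: int) -> str:
    folder = base + str(settings)  # e.g. 'figures3' or 'cluster7'
    if not path.exists(folder):
        mkdir(folder)
    return folder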

2
Pipeline/PreProcessor.py Normal file → Executable file

@ -44,7 +44,7 @@ class PreProcessor:
header=None,
names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
skiprows=7,
delimiter='\t',
delimiter=' ',
converters=convert_dict,
index_col=0)
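
The only change here swaps the read_csv delimiter from a tab to a single space, presumably to match the loggerProgram capture format. A minimal sketch of the import call as it reads after the change (converters omitted; the file name is the one used in Main.py):

from pandas import read_csv

frame = read_csv('loggerProgram0.log',
                 header=None,
                 names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
                 skiprows=7,
                 delimiter=' ',
                 index_col=0)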

0
Pipeline/SemanticAnalysis.py Normal file → Executable file

0
Pipeline/Signal.py Normal file → Executable file

0
Pipeline_multi-file/ArbID.py Normal file → Executable file

5
Pipeline_multi-file/FileBoi.py Normal file → Executable file

@ -60,13 +60,14 @@ class FileBoi:
# Check if this file name matches the expected name for a CAN data sample. If so, create new Sample
m = re.match('loggerProgram[\d]+.log', file)
if m:
i = 0
if not (make, model, year) in sample_dict:
sample_dict[(make, model, year)] = []
this_sample_index = str(len(sample_dict[(make, model, year)]))
this_sample = Sample(make=make, model=model, year=year, sample_index=this_sample_index,
sample_path=dirName + "/" + m.group(0), kfold_n=kfold_n)
sample_path=dirName + "/" + m.group(0), kfold_n=kfold_n)
sample_dict[(make, model, year)].append(this_sample)
current_vehicle = []
current_vehicle = []
else:
if this_dir == "Captures":
continue
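
FileBoi matches capture files named loggerProgram<N>.log and groups the resulting samples by (make, model, year). A minimal sketch of that matching and grouping, with placeholder vehicle metadata and file names:

import re

sample_dict = {}
for file in ['loggerProgram0.log', 'notes.txt', 'loggerProgram12.log']:
    if re.match(r'loggerProgram\d+\.log', file):
        # key by vehicle; values are the per-vehicle capture files
        sample_dict.setdefault(('make', 'model', 2019), []).append(file)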

0
Pipeline_multi-file/J1979.py Normal file → Executable file

130
Pipeline_multi-file/KnownSignalAnalysis.py Normal file

@ -0,0 +1,130 @@
from numpy import float64, nditer, uint64, zeros, ndarray, inf
from pandas import Series, DataFrame
from os import path, remove
from pickle import load
from ArbID import ArbID
from Signal import Signal
from PipelineTimer import PipelineTimer
from typing import List
from scipy import integrate
def transform_signal(a_timer: PipelineTimer,
arb_id_dict: dict,
signal_dict: dict,
transform_pickle_filename: str,
normalize_strategy,
given_arb_id: int,
force=False):
if force and path.isfile(transform_pickle_filename):
remove(transform_pickle_filename)
if path.isfile(transform_pickle_filename):
print("\nSignal transformation already completed and forcing is turned off. Using pickled data...")
return load(open(transform_pickle_filename, "rb"))
a_timer.start_function_time()
transform_dict = signal_dict
# arb_id_dict[given_arb_id * 256] = ArbID(given_arb_id * 256)
for k, arb_id in arb_id_dict.items():
# print(str(arb_id.id) + " == " + str(given_arb_id) + " ?\n")
if arb_id.id == given_arb_id:
arb_id.static = False
arb_id.short = False
if not arb_id.static:
for token in arb_id.tokenization:
a_timer.start_iteration_time()
signal = Signal(k * 256, token[0], token[1])
signal.static = False
# Convert the binary ndarray to a list of string representations of each row
temp1 = [''.join(str(x) for x in row) for row in arb_id.boolean_matrix[:, token[0]:token[1] + 1]]
temp2 = zeros((temp1.__len__()+1), dtype=uint64)
# convert each string representation to int
for i, row in enumerate(temp1):
temp2[i] = int(row, 2)
temp3 = integrate.cumtrapz(temp2)
print("Arb Id " + str(k) + ", Signal from " + str(token[0]) + " to " + str(token[1]) + " Integrated successfully")
# create a float64 pandas.Series using the time index from this Arb ID's original data.
signal.time_series = Series(temp3[:], index=arb_id.original_data.index, dtype=float64)
# Normalize the signal and update its meta-data
signal.normalize_and_set_metadata(normalize_strategy)
# add this signal to the signal dictionary which is keyed by Arbitration ID
if (k * 256) in transform_dict:
transform_dict[k * 256][(arb_id.id * 256, signal.start_index, signal.stop_index)] = signal
else:
print("Successfully added to transform dict")
transform_dict[k * 256] = {(arb_id.id * 256, signal.start_index, signal.stop_index): signal}
a_timer.set_token_to_signal()
a_timer.set_signal_generation()
return transform_dict
def transform_signals(a_timer: PipelineTimer,
arb_id_dict: dict,
transform_pickle_filename: str,
normalize_strategy,
force=False):
if force and path.isfile(transform_pickle_filename):
remove(transform_pickle_filename)
if path.isfile(transform_pickle_filename):
print("\nSignal transformation already completed and forcing is turned off. Using pickled data...")
return load(open(transform_pickle_filename, "rb"))
a_timer.start_function_time()
transform_dict = {} # arb_id_dict
for k, arb_id in arb_id_dict.items():
if not arb_id.static:
for token in arb_id.tokenization:
a_timer.start_iteration_time()
signal = Signal(k * 256, token[0], token[1])
# Convert the binary ndarray to a list of string representations of each row
temp1 = [''.join(str(x) for x in row) for row in arb_id.boolean_matrix[:, token[0]:token[1] + 1]]
temp2 = zeros((temp1.__len__()+1), dtype=uint64)
# convert each string representation to int
for i, row in enumerate(temp1):
temp2[i] = int(row, 2)
temp3 = integrate.cumtrapz(temp2)
# create a float64 pandas.Series using the time index from this Arb ID's original data.
signal.time_series = Series(temp3[:], index=arb_id.original_data.index, dtype=float64)
# Normalize the signal and update its meta-data
signal.normalize_and_set_metadata(normalize_strategy)
# add this signal to the signal dictionary which is keyed by Arbitration ID
if k in transform_dict:
transform_dict[k][(arb_id.id, signal.start_index, signal.stop_index)] = signal
else:
transform_dict[k] = {(arb_id.id, signal.start_index, signal.stop_index): signal}
a_timer.set_token_to_signal()
a_timer.set_signal_generation()
return transform_dict
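
transform_signal packs each tokenized bit slice of the known Arb ID into integers and integrates them with scipy's cumulative trapezoid rule, storing the derived signal under a shifted key (arb_id * 256) so it can be correlated like any other signal. A minimal sketch of the integration step on assumed token values:

import numpy as np
from scipy import integrate

token_values = np.array([0, 2, 4, 4, 3, 1], dtype=np.float64)  # assumed per-frame token values
derived = integrate.cumtrapz(token_values)  # cumulative integral; length is len(token_values) - 1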

0
Pipeline_multi-file/LexicalAnalysis.py Normal file → Executable file

30
Pipeline_multi-file/Main.py Normal file → Executable file

@ -5,14 +5,22 @@ from Sample import Sample
# Cross validation parameters for finding an optimal tokenization inversion distance threshold -- NOT WORKING?
kfold_n: int = 5
current_vehicle_number = 0
known_speed_arb_id = 514
good_boi = FileBoi()
samples = good_boi.go_fetch(kfold_n)
for key, sample_list in samples.items(): # type: tuple, list
for sample in sample_list: # type: Sample
print(current_vehicle_number)
# sample.tang_inversion_bit_dist += (0.01 * current_vehicle_number)
# sample.max_inter_cluster_dist += (0.01 * current_vehicle_number)
# sample.tang_inversion_bit_dist = round(sample.tang_inversion_bit_dist, 2) # removes floating point errors
# sample.max_inter_cluster_dist = round(sample.max_inter_cluster_dist, 2)
# print("\n\t##### Settings are " + str(sample.tang_inversion_bit_dist) + " and " + str(
# sample.max_inter_cluster_dist) + " #####")
print("\nData import and Pre-Processing for " + sample.output_vehicle_dir)
id_dict, j1979_dict = sample.pre_process()
id_dict, j1979_dict = sample.pre_process(known_speed_arb_id)
if j1979_dict:
sample.plot_j1979(j1979_dict, vehicle_number=str(current_vehicle_number))
@ -25,14 +33,22 @@ for key, sample_list in samples.items(): # type: tuple, list
print("\n\t##### BEGINNING LEXICAL ANALYSIS OF " + sample.output_vehicle_dir + " #####")
sample.tokenize_dictionary(id_dict)
signal_dict = sample.generate_signals(id_dict, bool(j1979_dict))
sample.plot_arb_ids(id_dict, signal_dict, vehicle_number=str(current_vehicle_number))
# sample.plot_arb_ids(id_dict, signal_dict, vehicle_number=str(current_vehicle_number))
# LEXICAL ANALYSIS #
# KNOWN SIGNAL ANALYSIS #
print("\n\t##### BEGINNING KNOWN SIGNAL ANALYSIS OF " + sample.output_vehicle_dir + " #####")
transform_dict= sample.transform_signal(id_dict, signal_dict, known_speed_arb_id)
sample.plot_arb_ids(id_dict, transform_dict, vehicle_number=str(current_vehicle_number))
# SEMANTIC ANALYSIS #
print("\n\t##### BEGINNING SEMANTIC ANALYSIS OF " + sample.output_vehicle_dir + " #####")
corr_matrix, combined_df = sample.generate_correlation_matrix(signal_dict)
corr_matrix, combined_df = sample.generate_correlation_matrix(transform_dict)
if j1979_dict:
signal_dict, j1979_correlation = sample.j1979_labeling(j1979_dict, signal_dict, combined_df)
transform_dict, j1979_correlation = sample.j1979_labeling(j1979_dict, transform_dict, combined_df)
cluster_dict, linkage_matrix = sample.cluster_signals(corr_matrix)
sample.plot_clusters(cluster_dict, signal_dict, bool(j1979_dict), vehicle_number=str(current_vehicle_number))
# sample.plot_clusters(cluster_dict, signal_dict, bool(j1979_dict), vehicle_number=str(current_vehicle_number))
sample.plot_known_signal_cluster(cluster_dict, signal_dict, bool(j1979_dict), known_speed_arb_id, vehicle_number=str(current_vehicle_number))
sample.plot_dendrogram(linkage_matrix, vehicle_number=str(current_vehicle_number))
current_vehicle_number += 1

0
Pipeline_multi-file/PipelineTimer.py Normal file → Executable file

165
Pipeline_multi-file/Plotter.py Normal file → Executable file

@ -25,13 +25,13 @@ def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dic
rmtree(arb_id_folder)
else:
print("\nArbID plotting appears to have already been done and forcing is turned off. Skipping...")
return
# return
a_timer.start_function_time()
for k_id, signals in signal_dict.items():
arb_id = arb_id_dict[k_id]
if not arb_id.static and not arb_id.short:
if (not arb_id.static and not arb_id.short) or k_id == 155136:
print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ") for Vehicle " + vehicle_number)
a_timer.start_iteration_time()
@ -85,7 +85,7 @@ def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dic
chdir(arb_id_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig(hex(arb_id.id) + "." + figure_format,
savefig(hex(signal.arb_id) + "." + figure_format,
bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
@ -311,3 +311,162 @@ def plot_dendrogram(a_timer: PipelineTimer,
transparent=figure_transp)
plt.close()
print("\t\tComplete...")
def plot_known_signal_cluster(a_timer: PipelineTimer,
cluster_dict: dict,
signal_dict: dict,
use_j1979_tags: bool,
vehicle_number: str,
given_arb_id: int,
force: bool = False):
if path.exists(cluster_folder):
if force:
rmtree(cluster_folder)
else:
print("\nCluster plotting appears to have already been done and forcing is turned off. Skipping...")
return
a_timer.start_function_time()
print("\n")
for cluster_number, list_of_signals in cluster_dict.items():
if [v for i, v in enumerate(list_of_signals) if (v[0] == given_arb_id or v[0] == given_arb_id * 256)]:
print("Plotting cluster", cluster_number, "with " + str(len(list_of_signals)) + " signals.")
a_timer.start_iteration_time()
# Setup the plot
fig, axes = plt.subplots(nrows=len(list_of_signals), ncols=1, squeeze=False)
plt.suptitle("Signal Cluster " + str(cluster_number) + " from Vehicle " + vehicle_number,
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + len(list_of_signals)+1) * 1.3)
size_adjust = len(list_of_signals) / 100
# The min() statement provides whitespace for the suptitle depending on the number of subplots.
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
# Plot the time series of each signal in the cluster
for i, signal_key in enumerate(list_of_signals):
signal = signal_dict[signal_key[0]][signal_key]
ax = axes[i, 0]
if signal.j1979_title and use_j1979_tags:
this_title = signal.plot_title + " [" + signal.j1979_title + \
" (PCC:" + str(round(signal.j1979_pcc, 2)) + ")]"
else:
this_title = signal.plot_title
ax.set_title(this_title,
style='italic',
size='medium')
ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
ax.plot(signal.time_series, color='black')
if not path.exists(cluster_folder):
mkdir(cluster_folder)
chdir(cluster_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
if len(list_of_signals) < 100:  # avoids errors when a low correlation threshold puts too many signals in one cluster
savefig("cluster_" + str(cluster_number) + "." + figure_format,
bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
else:
print("Too many clusters to plot! Skipping...")
chdir("..")
plt.close(fig)
a_timer.set_plot_save_cluster()
print("\tComplete...")
a_timer.set_plot_save_cluster_dict()
def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, vehicle_number: str,
force: bool=False):
if path.exists(arb_id_folder):
if force:
rmtree(arb_id_folder)
else:
print("\nArbID plotting appears to have already been done and forcing is turned off. Skipping...")
# return
a_timer.start_function_time()
for k_id, signals in signal_dict.items():
arb_id = arb_id_dict[k_id]
if (not arb_id.static and not arb_id.short) or k_id == 155136:
print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ") for Vehicle " + vehicle_number)
a_timer.start_iteration_time()
signals_to_plot = []
# Don't plot the static signals
for k_signal, signal in signals.items():
if not signal.static:
signals_to_plot.append(signal)
# There's a corner case where the Arb ID only has static signals. This conditional accounts for this.
# TODO: This corner case should probably be reflected by arb_id.static.
if len(signals_to_plot) < 1:
continue
# One row per signal plus one for the TANG. Squeeze is used to force axes to be an array to avoid errors.
fig, axes = plt.subplots(nrows=1 + len(signals_to_plot), ncols=1)
plt.suptitle("Time Series and TANG for Arbitration ID " + hex(k_id) + " from Vehicle " + vehicle_number,
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + len(signals_to_plot) + 1) * 1.3)
# The min() statement provides whitespace for the title depending on the number of subplots.
size_adjust = len(signals_to_plot) / 100
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
for i, signal in enumerate(signals_to_plot):
ax = axes[i]
ax.set_title(signal.plot_title,
style='italic',
size='medium')
ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
ax.plot(signal.time_series, color='black')
# Add a 25% opacity dashed black line to the entropy gradient plot at one boundary of each sub-flow
axes[-1].axvline(x=signal.start_index, alpha=0.25, c='black', linestyle='dashed')
# Plot the entropy gradient at the bottom of the overall output
ax = axes[-1]
ax.set_title("Min-Max Normalized Transition Aggregation N-Gram (TANG)",
style='italic',
size='medium')
tang_bit_width = arb_id.tang.shape[0]
ax.set_xlim([-0.01 * tang_bit_width, 1.005 * tang_bit_width])
y = arb_id.tang[:]
# Differentiate bit positions with non-zero and zero entropy using black circles and grey triangles respectively.
ix = isin(y, 0)
pad_bit = where(ix)
non_pad_bit = where(~ix)
ax.scatter(non_pad_bit, y[non_pad_bit], color='black', marker='o', s=10)
ax.scatter(pad_bit, y[pad_bit], color='grey', marker='^', s=10)
if not path.exists(arb_id_folder):
mkdir(arb_id_folder)
chdir(arb_id_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig(hex(signal.arb_id) + "." + figure_format,
bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
chdir("..")
plt.close(fig)
a_timer.set_plot_save_arb_id()
print("\tComplete...")
a_timer.set_plot_save_arb_id_dict()
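
The TANG subplot separates zero-entropy (padding) bit positions from the rest so they can be drawn with different markers. A minimal sketch of that split with numpy, on an assumed normalized TANG vector:

import numpy as np

tang = np.array([0.0, 0.7, 0.0, 0.3, 1.0])   # assumed min-max normalized TANG values
pad_bit = np.where(np.isin(tang, 0))         # zero-entropy positions -> grey triangles
non_pad_bit = np.where(~np.isin(tang, 0))    # remaining positions -> black circles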

10
Pipeline_multi-file/PreProcessor.py Normal file → Executable file

@ -1,4 +1,4 @@
from pandas import DataFrame, read_csv, Series
from pandas import DataFrame, read_csv, Series, concat
from numpy import int64
from os import path, remove, getcwd
from pickle import load
@ -45,7 +45,7 @@ class PreProcessor:
header=None,
names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
skiprows=7,
delimiter='\t',
delimiter=' ',
converters=convert_dict,
index_col=0)
@ -70,6 +70,7 @@ class PreProcessor:
time_conversion: int = 1000,
freq_analysis_accuracy: float = 0.0,
freq_synchronous_threshold: float = 0.0,
given_arb_id: int = 0,
force: bool = False) -> (dict, dict):
id_dictionary = {}
j1979_dictionary = {}
@ -92,6 +93,11 @@ class PreProcessor:
return id_dictionary, j1979_dictionary
else:
self.import_csv(a_timer, self.data_filename)
this_id = self.data.loc[self.data['id'] == given_arb_id].copy()
this_id.id = given_arb_id * 256
combined = concat([self.data, this_id])
self.data = combined
a_timer.start_function_time()
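
generate_arb_id_dictionary now takes the known speed Arb ID, copies its rows under a shifted ID (given_arb_id * 256), and concatenates the copy back onto the imported frame so the derived signal flows through the rest of the pipeline as its own entry. A minimal sketch of that duplication, assuming data is the imported DataFrame:

from pandas import concat

def duplicate_known_id(data, given_arb_id: int):
    this_id = data.loc[data['id'] == given_arb_id].copy()
    this_id['id'] = given_arb_id * 256  # the shifted ID keeps the copy distinct from the original
    return concat([data, this_id])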

52
Pipeline_multi-file/Sample.py Normal file → Executable file

@ -2,7 +2,7 @@ from PreProcessor import PreProcessor
from Validator import Validator
from LexicalAnalysis import tokenize_dictionary, generate_signals
from SemanticAnalysis import generate_correlation_matrix, signal_clustering, j1979_signal_labeling
from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster, plot_dendrogram
from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster, plot_dendrogram, plot_known_signal_cluster
from sklearn.preprocessing import minmax_scale
from typing import Callable
from PipelineTimer import PipelineTimer
@ -11,6 +11,8 @@ from pickle import dump, load
from numpy import ndarray, zeros, float16
from pandas import DataFrame
from KnownSignalAnalysis import transform_signals, transform_signal
# File names for the on-disc data input and output.
output_folder: str = 'output'
pickle_arb_id_filename: str = 'pickleArbIDs.p'
@ -26,6 +28,8 @@ pickle_combined_df_filename: str = 'pickleCombinedDataFrame.p'
csv_all_signals_filename: str = 'complete_correlation_matrix.csv'
pickle_timer_filename: str = 'pickleTimer.p'
pickle_transform_filename: str = 'pickleTransform'
dump_to_pickle: bool = True
# Change out the normalization strategies as needed.
@ -39,9 +43,11 @@ force_threshold_plotting: bool = False
force_j1979_plotting: bool = True
use_j1979: bool = True
force_transform: bool = False
force_lexical_analysis: bool = False
force_signal_generation: bool = False
force_arb_id_plotting: bool = True
force_arb_id_plotting: bool = False
force_correlation_matrix: bool = False
force_clustering: bool = False
@ -58,16 +64,15 @@ freq_synchronous_threshold = 0.1
# Threshold parameters used during lexical analysis.
tokenization_bit_distance: float = 0.2
tokenize_padding: bool = True
tokenize_padding: bool = False  # setting this to False seems to help find weak signals
merge_tokens: bool = True
# Threshold parameters used during semantic analysis
subset_selection_size: float = 0.25
max_intra_cluster_distance: float = 0.20
max_intra_cluster_distance: float = 0.10 # normally 0.25
min_j1979_correlation: float = 0.85
# fuzzy_labeling: bool = True
# A timer class to record timings throughout the pipeline.
a_timer = PipelineTimer(verbose=True)
@ -112,7 +117,7 @@ class Sample:
# Move back to root of './output/make_model_year/sample_index/"
chdir("../../../")
def pre_process(self):
def pre_process(self, given_arb_id):
self.make_and_move_to_vehicle_directory()
pre_processor = PreProcessor(self.path, pickle_arb_id_filename, pickle_j1979_filename, self.use_j1979)
id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
@ -120,6 +125,7 @@ class Sample:
time_conversion,
freq_analysis_accuracy,
freq_synchronous_threshold,
given_arb_id,
force_pre_processing)
if dump_to_pickle:
if force_pre_processing:
@ -303,3 +309,37 @@ class Sample:
plot_dendrogram(a_timer=a_timer, linkage_matrix=linkage_matrix, threshold=self.max_inter_cluster_dist,
vehicle_number=vehicle_number, force=force_dendrogram_plotting)
self.move_back_to_parent_directory()
def transform_signals(self, id_dictionary: dict):
self.make_and_move_to_vehicle_directory()
transform_dict = transform_signals(a_timer=a_timer,
arb_id_dict=id_dictionary,
transform_pickle_filename=pickle_transform_filename,
normalize_strategy=signal_normalize_strategy,
force=force_transform)
self.move_back_to_parent_directory()
return transform_dict
def transform_signal(self, id_dictionary: dict, signal_dict: dict, arb_id: int):
self.make_and_move_to_vehicle_directory()
transform_dict = transform_signal(a_timer=a_timer,
arb_id_dict=id_dictionary,
signal_dict=signal_dict,
transform_pickle_filename=pickle_transform_filename,
normalize_strategy=signal_normalize_strategy,
given_arb_id=arb_id,
force=force_transform)
self.move_back_to_parent_directory()
return transform_dict
def plot_known_signal_cluster(self, cluster_dictionary: dict, signal_dictionary: dict, use_j1979_tags: bool,
known_signal: int, vehicle_number: str):
self.make_and_move_to_vehicle_directory()
plot_known_signal_cluster(a_timer=a_timer,
cluster_dict=cluster_dictionary,
signal_dict=signal_dictionary,
use_j1979_tags=use_j1979_tags,
vehicle_number=vehicle_number,
given_arb_id=known_signal,
force=force_cluster_plotting)
self.move_back_to_parent_directory()

4
Pipeline_multi-file/SemanticAnalysis.py Normal file → Executable file

@ -1,5 +1,5 @@
from pandas import concat, DataFrame, read_csv
from numpy import ndarray, zeros
from numpy import ndarray, zeros, clip
from os import path, remove
from pickle import load, dump
from ast import literal_eval
@ -77,7 +77,7 @@ def signal_clustering(corr_matrix: DataFrame,
corr_matrix.where(corr_matrix > 0, 0, inplace=True)
corr_matrix = 1 - corr_matrix
X = corr_matrix.values # type: ndarray
Y = ssd.squareform(X)
Y = clip(ssd.squareform(X), 0, None)
# Z is the linkage matrix. This can serve as input to the scipy.cluster.hierarchy.dendrogram method
Z = linkage(Y, method='single', optimal_ordering=True)
fclus = fcluster(Z, t=threshold, criterion='distance')
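
The clip added here guards against tiny negative distances (floating-point error after computing 1 - correlation) that scipy's linkage step rejects. A minimal sketch of the full clustering path, assuming corr is a symmetric signal-correlation DataFrame:

import numpy as np
import scipy.spatial.distance as ssd
from scipy.cluster.hierarchy import linkage, fcluster

def cluster_from_correlation(corr, threshold: float = 0.10):
    corr = corr.where(corr > 0, 0)          # drop negative correlations
    dist = 1 - corr                         # turn similarity into distance
    condensed = np.clip(ssd.squareform(dist.values), 0, None)
    Z = linkage(condensed, method='single', optimal_ordering=True)
    return fcluster(Z, t=threshold, criterion='distance')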

0
Pipeline_multi-file/Signal.py Normal file → Executable file

0
Pipeline_multi-file/Validator.py Normal file → Executable file

0
Pipeline_multi-file/maximize_sum_shannon.py Normal file → Executable file