Initial source code commit

This commit is contained in:
Brent 2019-03-08 15:17:12 -05:00
parent 69ea19f719
commit 94bf187ecd
124 changed files with 276183 additions and 0 deletions

BIN
AFIT-END-DS-18-D-003.pdf Normal file

Binary file not shown.

Pipeline/ArbID.py Normal file

@@ -0,0 +1,89 @@
from typing import Callable, List
from pandas import DataFrame
from numpy import float64, logical_xor, mean, ndarray, sqrt, std, sum, uint8, zeros
from PipelineTimer import PipelineTimer
# noinspection PyArgumentList
class ArbID:
def __init__(self, arb_id: int):
self.id: int = arb_id
# These features are set by PreProcessing.py's generate_arb_id_dictionary
self.dlc: int = 0
self.original_data: DataFrame = None
# These features are set in generate_binary_matrix_and_tang called by generate_arb_id_dictionary
self.boolean_matrix: ndarray = None
self.tang: ndarray = None
self.static: bool = True
# These features are set in analyze_transmission_frequency called by generate_arb_id_dictionary
self.ci_sensitivity: float = 0.0
self.freq_mean: float = 0.0
self.freq_std: float = 0.0
self.freq_ci: tuple = None
self.mean_to_ci_ratio: float = 0.0
self.synchronous: bool = False
# These features are set by LexicalAnalysis.py's get_composition
self.tokenization: List[tuple] = []
self.padding: List[int] = []
def generate_binary_matrix_and_tang(self, a_timer: PipelineTimer, normalize_strategy: Callable):
a_timer.start_nested_function_time()
self.boolean_matrix = zeros((self.original_data.__len__(), self.dlc * 8), dtype=uint8)
for i, row in enumerate(self.original_data.itertuples()):
for j, cell in enumerate(row[1:]):
# Skip cells that were already 0
if cell > 0:
# i is the row in the boolean_matrix
# j*8 is the left hand bit for this byte in the payload
# j*8 + 8 is the right hand bit for this byte in the payload
# e.g. byte index 1 starts at bits 1*8 = 8 to 1*8+8 = 16; [8:16]
# likewise, byte index 7 starts at bits 7*8 = 56 to 7*8+8 = 64
# Numpy indexing is non-inclusive of the upper bound. So [0:8] is the first 8 elements
bin_string = format(cell, '08b')
self.boolean_matrix[i, j * 8:j * 8 + 8] = [x == '1' for x in bin_string]
a_timer.set_hex_to_bool_matrix()
if self.boolean_matrix.shape[0] > 1:
a_timer.start_nested_function_time()
transition_matrix = logical_xor(self.boolean_matrix[:-1, ], self.boolean_matrix[1:, ])
self.tang = sum(transition_matrix, axis=0, dtype=float64)
# Ensure there are no divide-by-zero issues
if max(self.tang) > 0:
normalize_strategy(self.tang, axis=0, copy=False)
self.static = False
a_timer.set_bool_matrix_to_tang()
def analyze_transmission_frequency(self,
time_convert: int = 1000,
ci_accuracy: float = 1.645,
synchronous_threshold: float = 0.1):
# Don't bother doing this if there isn't enough data. You would need to handle a bunch of edge cases anyways.
if self.original_data.__len__() < 4:
return
self.ci_sensitivity = ci_accuracy
# time_convert = 1000 is intended to convert seconds to milliseconds.
freq_intervals = self.original_data.index[1:] - self.original_data.index[:-1]
self.freq_mean = mean(freq_intervals) * time_convert
self.freq_std = std(freq_intervals, ddof=1)*time_convert
# Assumes distribution of freq_intervals is gaussian normal.
mean_offset = ci_accuracy * self.freq_std / sqrt(len(freq_intervals))
self.freq_ci = (self.freq_mean - mean_offset, self.freq_mean + mean_offset)
# mean_to_ci_ratio is the ratio of the CI range to the mean value. This provides a heuristic to determine
# if this Arb ID can be called 'synchronous' given the time scale of its average transmission frequency.
# For example, imagine an Arb ID transmitting with a mean frequency of exactly one second. Its 90% Confidence
# Interval spans 50 milliseconds about that ideal mean. It's reasonable to assume this Arb ID was engineered to
# be a 'synchronous' signal. However, a different Arb ID with a mean frequency of 40 milliseconds and the same
# confidence interval could not be reasonably assumed to have been engineered to be 'synchronous'. This other
# Arb ID was engineered to be 'high frequency' compared to the first example, but there isn't evidence that it's
# intended to be a high frequency 'synchronous' process that adheres to a certain clock frequency.
# This inference relies upon the assumption that the OEM didn't engineer the bus to be a train wreck with some
# IDs losing an arbitrarily large percentage of their arbitration phases.
self.mean_to_ci_ratio = 2*mean_offset/self.freq_mean
if self.mean_to_ci_ratio <= synchronous_threshold:
self.synchronous = True

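A minimal worked example of the mean_to_ci_ratio heuristic described in analyze_transmission_frequency; the interval statistics below are invented for illustration and are not taken from any capture.

# Hypothetical numbers: ~1 second mean period, ~48.6 ms sample standard deviation,
# 10 observed intervals, 90% confidence (z = 1.645).
from numpy import sqrt

freq_mean = 1000.0                                        # milliseconds
freq_std = 48.6                                           # milliseconds
n_intervals = 10
ci_accuracy = 1.645
mean_offset = ci_accuracy * freq_std / sqrt(n_intervals)  # ~25.3 ms half-width
mean_to_ci_ratio = 2 * mean_offset / freq_mean            # ~0.05
print(mean_to_ci_ratio <= 0.1)                            # True -> flagged 'synchronous'
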
Pipeline/J1979.py Normal file

@@ -0,0 +1,97 @@
from pandas import DataFrame, Series
from numpy import int8, float16, uint8, uint16
class J1979:
def __init__(self, pid: int, original_data: DataFrame):
self.pid: int = pid
self.title: str = ""
self.data: Series = self.process_response_data(original_data)
print("Found " + str(self.data.shape[0]) + " responses for J1979 PID " + str(hex(self.pid)) + ":", self.title)
def process_response_data(self, original_data) -> Series:
# ISO-TP formatted Universal Diagnostic Service (UDS) requests that were sent by the CAN collection device
# during sampling. Request made using Arb ID 0x7DF with DLC of 8. Response should use Arb ID 0x7E8 (0x7DF + 0x8).
# DataFrame Columns: b0 b1 b2 b3 ... b7
# -- -- -- -- --
# PID 0x0C (12 dec) (Engine RPM): 02 01 0c 00 ... 00
# PID 0x0D (13 dec) (Vehicle Speed): 02 01 0d 00 ... 00
# PID 0x11 (17 dec) (Throttle Pos.): 02 01 11 00 ... 00
# PID 0x61 (97 dec) (Demand % Torque): 02 01 61 00 ... 00
# PID 0x62 (98 dec) (Engine % Torque): 02 01 62 00 ... 00
# PID 0x63 (99 dec) (Ref. Torque): 02 01 63 00 ... 00
# PID 0x8e (142 dec) (Friction Torque): 02 01 8e 00 ... 00
# Responses being managed here should follow the ISO-TP + UDS per-byte format AA BB CC DD .. DD
# BYTE: AA BB CC DD ... DD
# USE: response size (bytes) UDS mode + 0x40 UDS PID response data
# DF COLUMN: b0 b1 b2 b3 ... b7
# Remember that this response data is already converted to decimal. Thus, byte BB = 65 = 0x41 = 0x01 + 0x40.
# If BB isn't 0x41, check what the error code is. Some error codes are listed in the UDS chapter of the car
# hacker's handbook available at http://opengarages.org/handbook/ebook/.
if self.pid == 12:
self.title = 'Engine RPM'
# PID is 0x0C: Engine RPM. 2 bytes of data AA BB converted using 1/4 RPM per bit: (256*AA+BB)/4
# Min value: 0 Max value: 16,383.75 units: rpm
return Series(data=(256*original_data['b3']+original_data['b4'])/4,
index=original_data.index,
name=self.title,
dtype=float16)
elif self.pid == 13:
self.title = 'Speed km/h'
# PID is 0x0D: Vehicle Speed. 1 byte of data AA using 1km/h per bit: no conversion necessary
# Min value: 0 Max value: 255 (158.44965mph) units: km/h
return Series(data=original_data['b3'],
index=original_data.index,
name=self.title,
dtype=uint8)
elif self.pid == 17:
self.title = 'Throttle %'
# PID is 0x11: Throttle Position. 1 byte of data AA using 100/255 % per bit: AA * 100/255% throttle.
# Min value: 0 Max value: 100 units: %
return Series(data=100 * original_data['b3'] / 255,
index=original_data.index,
name=self.title,
dtype=uint8)
elif self.pid == 97:
self.title = 'Demand Torque %'
# PID is 0x61: Driver's demand engine - percent torque. 1 byte of data AA using 1%/bit with -125 offset
# AA - 125
# Min value: -125 Max value: 130 units: %
return Series(data=original_data['b3'] - 125,
index=original_data.index,
name=self.title,
dtype=int8)
elif self.pid == 98:
self.title = 'Actual Torque %'
# PID is 0x62: Actual engine - percent torque. 1 byte of data AA using 1%/bit with -125 offset
# AA - 125
# Min value: -125 Max value: 130 units: %
return Series(data=original_data['b3'] - 125,
index=original_data.index,
name=self.title,
dtype=int8)
elif self.pid == 99:
self.title = 'Reference Torque Nm'
# PID is 0x63: Engine reference torque. 2 bytes of data AA BB using 1 Nm/bit: 256*AA + BB Nm torque
# Min value: 0 Max value: 65,535 units: Nm
return Series(data=256*original_data['b3'] + original_data['b4'],
index=original_data.index,
name=self.title,
dtype=uint16)
elif self.pid == 142:
self.title = 'Engine Friction Torque %'
# PID is 0x8E: Engine Friction - Percent Torque. 1 byte of data AA using 1%/bit with -125 offset. AA - 125
# Min value: -125 Max value: 130 units: %
return Series(data=original_data['b3'] - 125,
index=original_data.index,
name=self.title,
dtype=int8)
else:
# Looks like you were requesting J1979 data with your sniffer that hasn't been implemented in this code.
# Time to do the leg work to expand this class accordingly then re-run the pipeline.
raise ValueError("Encountered J1979 PID " + str(hex(self.pid)) +
" in Pre-Processing that hasn't been programmed. Expand the J1979 class to handle all "
"J1979 requests made during data collection.")

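A small sketch of the PID 0x0C (Engine RPM) conversion handled by process_response_data above; the response bytes are made up for illustration.

# Hypothetical UDS response data bytes for PID 0x0C: b3 = 0x1A, b4 = 0xF8.
b3, b4 = 0x1A, 0xF8
rpm = (256 * b3 + b4) / 4   # (256*26 + 248) / 4 = 1726.0 rpm
print(rpm)
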
Pipeline/LexicalAnalysis.py Normal file

@@ -0,0 +1,177 @@
from numpy import float64, nditer, uint64, zeros
from pandas import Series
from os import path, remove
from pickle import load
from ArbID import ArbID
from Signal import Signal
from PipelineTimer import PipelineTimer
def tokenize_dictionary(a_timer: PipelineTimer,
d: dict,
force: bool = False,
include_padding: bool = False,
merge: bool = True,
max_distance: float = 0.1):
a_timer.start_function_time()
for k, arb_id in d.items():
if not arb_id.static:
if arb_id.padding and not force:
print("\nTokenization already completed and forcing is turned off. Skipping...")
return
a_timer.start_iteration_time()
get_composition(arb_id, include_padding, max_distance)
a_timer.set_tang_to_composition()
if merge:
a_timer.start_iteration_time()
merge_tokens(arb_id, max_distance)
a_timer.set_composition_merge()
a_timer.set_tokenization()
# This is a greedy algorithm to cluster bit positions in a series of CAN payloads suspected of being part of a
# continuous numerical time series.
def get_composition(arb_id: ArbID, include_padding=False, max_inversion_distance: float = 0.0):
tokens = []
start_index = 0
currently_clustering = False
big_endian = True
last_bit_position = 0
# Consider each element in the TANG. The TANG is an ndarray with index being bit position from the
# original CAN data. The cell value is the observed transition frequency for that bit position.
for i, bit_position in enumerate(nditer(arb_id.tang)):
# Is this a padding bit?
if bit_position <= 0.000001:
arb_id.padding.append(i)
# Are we clustering padding bits? If so, proceed to the normal clustering logic. Else, do the following.
if not include_padding:
if currently_clustering:
# This is padding, we're already clustering, and we're not clustering padding; save the token.
tokens.append((start_index, i - 1))
currently_clustering = False
start_index = i + 1
last_bit_position = bit_position
continue
# Are we still enlarging the current token?
if currently_clustering:
if bit_position >= last_bit_position and big_endian:
pass
elif bit_position <= last_bit_position and not big_endian:
pass
# Are we allowing inversions (max_inversion_distance > 0)? If so, check if this inversion is acceptable.
elif abs(bit_position-last_bit_position) <= max_inversion_distance:
pass
# Is this the second bit position we need to establish the endianness of the signal?
elif start_index == i - 1:
if bit_position >= last_bit_position:
big_endian = True
else:
big_endian = False
# This is an unacceptable transition frequency inversion, save the current token and start a new one
else:
tokens.append((start_index, i - 1))
start_index = i
# We aren't currently clustering and we intend to cluster this bit position
else:
currently_clustering = True
start_index = i
last_bit_position = bit_position
# We reached the last bit position while clustering. Add this final token.
if currently_clustering:
tokens.append((start_index, arb_id.tang.__len__() - 1))
arb_id.tokenization = tokens
def merge_tokens(arb_id: ArbID, max_distance):
# if arb_id.id == 292: # make this equal to the decimal value of an Arb ID in the data you want to see get merged
# verbose = True
# else:
verbose = False
if verbose:
print("\nENTERING MERGE PHASE OF ARB ID", arb_id.id)
print("STARTING TOKENS:", arb_id.tokenization)
if arb_id.static or arb_id.tokenization.__len__() < 2:
# Make sure there are multiple tokens to merge
pass
else:
# Editing data structures while iterating over them is a bad idea in Python
# Instead, let's keep track of tokens we want to delete using remove_list.
remove_list = []
last = None
for i, token in enumerate(arb_id.tokenization):
if verbose:
print("Last:", last, "\tCurrent:", token)
if last:
# Are these tokens adjacent?
if last[1] + 1 == token[0]:
if verbose:
print("\tAdjacent with distance of", abs(arb_id.tang[last[1]] - arb_id.tang[token[0]]))
# Is the transition frequency of the adjacent bit positions less than the max distance threshold?
if abs(arb_id.tang[last[1]] - arb_id.tang[token[0]]) <= max_distance:
remove_list.append(last)
token = (last[0], token[1])
arb_id.tokenization[i] = token
if verbose:
print("\t\tMerged into", token)
last = token
if remove_list:
for token in remove_list:
arb_id.tokenization.remove(token)
if verbose:
print("final tokenization", arb_id.tokenization)
# noinspection PyTypeChecker
def generate_signals(a_timer: PipelineTimer,
arb_id_dict: dict,
signal_pickle_filename: str,
normalize_strategy,
force=False):
if path.isfile(signal_pickle_filename):
if force:
# Remove any existing pickled Signal dictionary and create one.
remove(signal_pickle_filename)
else:
print("\nSignal generation already completed and forcing is turned off. Using pickled data...")
return load(open(signal_pickle_filename, "rb"))
a_timer.start_function_time()
signal_dict = {}
for k, arb_id in arb_id_dict.items():
if not arb_id.static:
for token in arb_id.tokenization:
a_timer.start_iteration_time()
signal = Signal(k, token[0], token[1])
# Convert the binary ndarray to a list of string representations of each row
temp1 = [''.join(str(x) for x in row) for row in arb_id.boolean_matrix[:, token[0]:token[1] + 1]]
temp2 = zeros((temp1.__len__(), 1), dtype=uint64)
# convert each string representation to int
for i, row in enumerate(temp1):
temp2[i] = int(row, 2)
# create an unsigned integer pandas.Series using the time index from this Arb ID's original data.
signal.time_series = Series(temp2[:, 0], index=arb_id.original_data.index, dtype=float64)
# Normalize the signal and update its meta-data
signal.normalize_and_set_metadata(normalize_strategy)
# add this signal to the signal dictionary which is keyed by Arbitration ID
if k in signal_dict:
signal_dict[k][(arb_id.id, signal.start_index, signal.stop_index)] = signal
else:
signal_dict[k] = {(arb_id.id, signal.start_index, signal.stop_index): signal}
a_timer.set_token_to_signal()
a_timer.set_signal_generation()
return signal_dict

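A usage sketch for get_composition on a hand-made TANG; the values are invented, whereas a real TANG comes from ArbID.generate_binary_matrix_and_tang.

from numpy import array, float64
from ArbID import ArbID
from LexicalAnalysis import get_composition

toy = ArbID(0x123)
toy.tang = array([0.1, 0.2, 0.9, 0.0, 0.0, 0.30, 0.32, 0.28], dtype=float64)
get_composition(toy, include_padding=False, max_inversion_distance=0.1)
print(toy.tokenization)   # expected [(0, 2), (5, 7)] with these toy values
print(toy.padding)        # [3, 4]; bits 3 and 4 never toggled
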
Pipeline/Main.py Normal file

@@ -0,0 +1,198 @@
from os import chdir, mkdir, path, remove
from pickle import dump
from sklearn.preprocessing import minmax_scale
from typing import Callable
from PreProcessor import PreProcessor
from LexicalAnalysis import tokenize_dictionary, generate_signals
from SemanticAnalysis import subset_selection, subset_correlation, greedy_signal_clustering, label_propagation, \
j1979_signal_labeling
from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster
from PipelineTimer import PipelineTimer
# File names for the on-disc data input and output.
# Input:
can_data_filename: str = 'drive_runway_afit.log'
# can_data_filename: str = 'loggerProgram0.log'
# Output:
output_folder: str = 'output'
pickle_arb_id_filename: str = 'pickleArbIDs.p'
pickle_j1979_filename: str = 'pickleJ1979.p'
pickle_signal_filename: str = 'pickleSignals.p'
pickle_subset_filename: str = 'pickleSubset.p'
csv_correlation_filename: str = 'subset_correlation_matrix.csv'
pickle_j1979_correlation: str = 'pickleJ1979_correlation.p'
pickle_clusters_filename: str = 'pickleClusters.p'
pickle_all_signal_filename: str = 'pickleAllSignalsDataFrame.p'
csv_all_signals_filename: str = 'complete_correlation_matrix.csv'
pickle_timer_filename: str = 'pickleTimer.p'
# Change out the normalization strategies as needed.
tang_normalize_strategy: Callable = minmax_scale
signal_normalize_strategy: Callable = minmax_scale
# Turn on or off portions of the pipeline and output methods using these flags.
force_pre_processing: bool = False
force_j1979_plotting: bool = False
force_lexical_analysis: bool = False
force_arb_id_plotting: bool = True
force_semantic_analysis: bool = False
force_signal_labeling: bool = False
use_j1979_tags_in_plots: bool = True
force_cluster_plotting: bool = False
dump_to_pickle: bool = True
# Parameters and threshold used for Arb ID transmission frequency analysis during Pre-processing.
time_conversion = 1000 # convert seconds to milliseconds
z_lookup = {.8: 1.28, .9: 1.645, .95: 1.96, .98: 2.33, .99: 2.58}
freq_analysis_accuracy = z_lookup[0.9]
freq_synchronous_threshold = 0.1
# Threshold parameters used during lexical analysis.
tokenization_bit_distance: float = 0.2
tokenize_padding: bool = True
# Threshold parameters used during semantic analysis
subset_selection_size: float = 0.25
fuzzy_labeling: bool = True
min_correlation_threshold: float = 0.85
# A timer class to record timings throughout the pipeline.
a_timer = PipelineTimer(verbose=True)
# DATA IMPORT AND PRE-PROCESSING #
pre_processor = PreProcessor(can_data_filename, pickle_arb_id_filename, pickle_j1979_filename)
id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
tang_normalize_strategy,
time_conversion,
freq_analysis_accuracy,
freq_synchronous_threshold,
force_pre_processing)
if j1979_dictionary:
plot_j1979(a_timer, j1979_dictionary, force_j1979_plotting)
# LEXICAL ANALYSIS #
print("\n\t\t\t##### BEGINNING LEXICAL ANALYSIS #####")
tokenize_dictionary(a_timer,
id_dictionary,
force_lexical_analysis,
include_padding=tokenize_padding,
merge=True,
max_distance=tokenization_bit_distance)
signal_dictionary = generate_signals(a_timer,
id_dictionary,
pickle_signal_filename,
signal_normalize_strategy,
force_lexical_analysis)
plot_signals_by_arb_id(a_timer, id_dictionary, signal_dictionary, force_arb_id_plotting)
# SEMANTIC ANALYSIS #
print("\n\t\t\t##### BEGINNING SEMANTIC ANALYSIS #####")
subset_df = subset_selection(a_timer,
signal_dictionary,
pickle_subset_filename,
force_semantic_analysis,
subset_size=subset_selection_size)
corr_matrix_subset = subset_correlation(subset_df, csv_correlation_filename, force_semantic_analysis)
cluster_dict = greedy_signal_clustering(corr_matrix_subset,
correlation_threshold=min_correlation_threshold,
fuzzy_labeling=fuzzy_labeling)
df_full, corr_matrix_full, cluster_dict = label_propagation(a_timer,
pickle_clusters_filename=pickle_clusters_filename,
pickle_all_signals_df_filename=pickle_all_signal_filename,
csv_signals_correlation_filename=csv_all_signals_filename,
signal_dict=signal_dictionary,
cluster_dict=cluster_dict,
correlation_threshold=min_correlation_threshold,
force=force_semantic_analysis)
signal_dictionary, j1979_correlations = j1979_signal_labeling(a_timer=a_timer,
j1979_corr_filename=pickle_j1979_correlation,
df_signals=df_full,
j1979_dict=j1979_dictionary,
signal_dict=signal_dictionary,
correlation_threshold=min_correlation_threshold,
force=force_signal_labeling)
plot_signals_by_cluster(a_timer, cluster_dict, signal_dictionary, use_j1979_tags_in_plots, force_cluster_plotting)
# DATA STORAGE #
if dump_to_pickle:
if force_pre_processing:
if path.isfile(pickle_arb_id_filename):
remove(pickle_arb_id_filename)
if path.isfile(pickle_j1979_filename):
remove(pickle_j1979_filename)
if force_lexical_analysis or force_signal_labeling:
if path.isfile(pickle_signal_filename):
remove(pickle_signal_filename)
if force_semantic_analysis:
if path.isfile(pickle_subset_filename):
remove(pickle_subset_filename)
if path.isfile(csv_correlation_filename):
remove(csv_correlation_filename)
if path.isfile(pickle_j1979_correlation):
remove(pickle_j1979_correlation)
if path.isfile(pickle_clusters_filename):
remove(pickle_clusters_filename)
if path.isfile(pickle_all_signal_filename):
remove(pickle_all_signal_filename)
if path.isfile(csv_all_signals_filename):
remove(csv_all_signals_filename)
timer_flag = 0
if not path.exists(output_folder):
mkdir(output_folder)
chdir(output_folder)
if not path.isfile(pickle_arb_id_filename):
timer_flag += 1
print("\nDumping arb ID dictionary to " + pickle_arb_id_filename)
dump(id_dictionary, open(pickle_arb_id_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_j1979_filename):
timer_flag += 1
print("\nDumping J1979 dictionary to " + pickle_j1979_filename)
dump(j1979_dictionary, open(pickle_j1979_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_signal_filename):
timer_flag += 1
print("\nDumping signal dictionary to " + pickle_signal_filename)
dump(signal_dictionary, open(pickle_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_subset_filename):
timer_flag += 1
print("\nDumping signal subset list to " + pickle_subset_filename)
dump(subset_df, open(pickle_subset_filename, "wb"))
print("\tComplete...")
if not path.isfile(csv_correlation_filename):
timer_flag += 1
print("\nDumping subset correlation matrix to " + csv_correlation_filename)
corr_matrix_subset.to_csv(csv_correlation_filename)
print("\tComplete...")
if not path.isfile(pickle_j1979_correlation):
timer_flag += 1
print("\nDumping J1979 correlation DataFrame to " + pickle_j1979_correlation)
dump(j1979_correlations, open(pickle_j1979_correlation, "wb"))
print("\tComplete...")
if not path.isfile(pickle_clusters_filename):
timer_flag += 1
print("\nDumping cluster dictionary to " + pickle_clusters_filename)
dump(cluster_dict, open(pickle_clusters_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_all_signal_filename):
timer_flag += 1
print("\nDumping complete signals DataFrame to " + pickle_all_signal_filename)
dump(df_full, open(pickle_all_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(csv_all_signals_filename):
timer_flag += 1
print("\nDumping complete correlation matrix to " + csv_all_signals_filename)
corr_matrix_full.to_csv(csv_all_signals_filename)
print("\tComplete...")
if timer_flag == 9:
print("\nDumping pipeline timer to " + pickle_timer_filename)
dump(a_timer, open(pickle_timer_filename, "wb"))
print("\tComplete...")
chdir("..")

Pipeline/PipelineTimer.py Normal file

@@ -0,0 +1,149 @@
from time import time
from typing import List
class PipelineTimer:
def __init__(self, verbose: bool = True):
self.verbose: bool = verbose
self.function_time: float = 0.0
self.nested_function_time: float = 0.0
self.iteration_time: float = 0.0
self.nested_iteration_time: float = 0.0
# Pre-Processing Timings
self.can_csv_to_df: float = 0.0
self.raw_df_to_arb_id_dict: float = 0.0
self.arb_id_creation: List[float] = []
self.j1979_creation: float = 0.0
self.hex_to_bool_matrix: List[float] = []
self.bool_matrix_to_tang: List[float] = []
self.plot_save_j1979_dict: float = 0.0
self.plot_save_j1979_pid: List[float] = []
# Lexical Analysis Timings
self.tokenization: float = 0.0
self.tang_to_composition: List[float] = []
self.composition_merge: List[float] = []
self.signal_generation: float = 0.0
self.token_to_signal: List[float] = []
self.plot_save_arb_id_dict: float = 0.0
self.plot_save_arb_id: List[float] = []
# Semantic Analysis Timings
self.subset_selection: float = 0.0
self.plot_save_cluster_dict: float = 0.0
self.label_propagation: float = 0.0
self.plot_save_cluster: List[float] = []
def start_function_time(self):
self.function_time = time()
def start_nested_function_time(self):
self.nested_function_time = time()
def start_iteration_time(self):
self.iteration_time = time()
def start_nested_iteration_time(self):
self.nested_iteration_time = time()
# Pre-Processing Timings #
# Called in PreProcessor.import_csv
def set_can_csv_to_df(self):
self.can_csv_to_df = time() - self.function_time
if self.verbose:
print("\n" + str(self.can_csv_to_df) + " seconds to import and format raw data into a DataFrame")
# Called in PreProcessor.generate_arb_id_dictionary
def set_raw_df_to_arb_id_dict(self):
self.raw_df_to_arb_id_dict = time() - self.function_time
if self.verbose:
print("\n" + str(self.raw_df_to_arb_id_dict) +
" seconds to produce arbitration ID dictionary, boolean matrices, and TANGs")
# Called in the loop within PreProcessor.generate_arb_id_dictionary
def set_arb_id_creation(self):
self.arb_id_creation.append(time() - self.iteration_time)
def set_j1979_creation(self):
self.j1979_creation = time() - self.nested_function_time
if self.verbose:
print("\n" + str(self.j1979_creation) + " seconds to process J1979 response data into a dictionary")
# Called in ArbID.generate_binary_matrix_and_tang
def set_hex_to_bool_matrix(self):
self.hex_to_bool_matrix.append(time() - self.nested_function_time)
# Called in ArbID.generate_binary_matrix_and_tang
def set_bool_matrix_to_tang(self):
self.bool_matrix_to_tang.append(time() - self.nested_function_time)
# Called in the Plotter.py plot_j1979 function.
def set_plot_save_j1979_dict(self):
self.plot_save_j1979_dict = time() - self.function_time
if self.verbose:
print("\n" + str(self.plot_save_j1979_dict) + " seconds to plot and save the J1979 response data")
# Called in the loop within the Plotter.py plot_j1979 function.
def set_plot_save_j1979_pid(self):
self.plot_save_j1979_pid.append(time() - self.iteration_time)
# Lexical Analysis Timings #
# Called in the LexicalAnalysis.py tokenize_dictionary function.
def set_tokenization(self):
self.tokenization = time() - self.function_time
if self.verbose:
print("\n" + str(self.tokenization) + " seconds to tokenize the arbitration ID dictionary using TANGs")
# Called in the loop within the LexicalAnalysis.py tokenize_dictionary function.
def set_tang_to_composition(self):
self.tang_to_composition.append(time() - self.iteration_time)
# Called in the loop within the LexicalAnalysis.py tokenize_dictionary function.
def set_composition_merge(self):
self.composition_merge.append(time() - self.iteration_time)
# Called in the LexicalAnalysis.py generate_signals function.
def set_signal_generation(self):
self.signal_generation = time() - self.function_time
if self.verbose:
print("\n" + str(self.signal_generation) +
" seconds to generate signals and their statistics using token compositions.")
# Called in the loop within the LexicalAnalysis.py generate_signals function.
def set_token_to_signal(self):
self.token_to_signal.append(time() - self.iteration_time)
# Called in the Plotter.py plot_signals_by_arb_id function.
def set_plot_save_arb_id_dict(self):
self.plot_save_arb_id_dict = time() - self.function_time
if self.verbose:
print("\n" + str(self.plot_save_arb_id_dict) + " seconds to plot and save the Signals and TANGs by Arb ID")
# Called in the loop within the Plotter.py plot_signals_by_arb_id function.
def set_plot_save_arb_id(self):
self.plot_save_arb_id.append(time() - self.iteration_time)
# Semantic Analysis Timings #
# Called in the SemanticAnalysis.py subset_correlation function.
def set_subset_selection(self):
self.subset_selection = time() - self.function_time
# Called in the SemanticAnalysis.py label_propagation function.
def set_label_propagation(self):
self.label_propagation = time() - self.function_time
if self.verbose:
print("\n" + str(self.label_propagation) + " seconds to perform label propagation.")
# Called in the Plotter.py plot_signals_by_cluster function.
def set_plot_save_cluster_dict(self):
self.plot_save_cluster_dict = time() - self.function_time
if self.verbose:
print("\n" + str(self.plot_save_cluster_dict) + " seconds to plot and save the clusters.")
# Called in the loop within the Plotter.py plot_signals_by_cluster function.
def set_plot_save_cluster(self):
self.plot_save_cluster.append(time() - self.iteration_time)

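A brief sketch of the intended start/set pairing for these timers, mirroring how PreProcessor.import_csv uses them; the sleep call simply stands in for the timed work.

from time import sleep
from PipelineTimer import PipelineTimer

a_timer = PipelineTimer(verbose=True)
a_timer.start_function_time()
sleep(0.1)                   # stand-in for the timed work, e.g. reading the CSV
a_timer.set_can_csv_to_df()  # records the elapsed seconds and prints them when verbose
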
Pipeline/Plotter.py Normal file

@@ -0,0 +1,221 @@
import matplotlib.pyplot as plt
from matplotlib.pyplot import savefig
from numpy import where, isin
from os import chdir, mkdir, path
from shutil import rmtree
from PipelineTimer import PipelineTimer
figure_format: str = "png" # png works well for quickly previewing results and draft results write ups.
# figure_format: str = "eps" # Journals/Conferences generally prefer EPS file format for camera-ready copies.
figure_dpi: int = 300
figure_transp: bool = False
arb_id_folder: str = 'figures'
cluster_folder: str = 'clusters'
j1979_folder: str = 'j1979'
def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, force: bool=False):
if path.exists(arb_id_folder):
if force:
rmtree(arb_id_folder)
else:
print("\nArbID plotting appears to have already been done and forcing is turned off. Skipping...")
return
a_timer.start_function_time()
for k_id, signals in signal_dict.items():
arb_id = arb_id_dict[k_id]
if not arb_id.static:
print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ")")
a_timer.start_iteration_time()
signals_to_plot = []
# Don't plot the static signals
for k_signal, signal in signals.items():
if not signal.static:
signals_to_plot.append(signal)
# One row per signal plus one for the TANG
fig, axes = plt.subplots(nrows=1 + len(signals_to_plot), ncols=1)
plt.suptitle("Time Series and TANG for Arbitration ID " + hex(k_id),
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + len(signals_to_plot) + 1) * 1.3)
# The min() statement provides whitespace for the title depending on the number of subplots.
size_adjust = len(signals_to_plot) / 100
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
for i, signal in enumerate(signals_to_plot):
ax = axes[i]
ax.set_title(signal.plot_title,
style='italic',
size='medium')
ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
ax.plot(signal.time_series, color='black')
# Add a 25% opacity dashed black line to the entropy gradient plot at one boundary of each sub-flow
axes[-1].axvline(x=signal.start_index, alpha=0.25, c='black', linestyle='dashed')
# Plot the entropy gradient at the bottom of the overall output
ax = axes[-1]
ax.set_title("Min-Max Normalized Transition Aggregation N-Gram (TANG)",
style='italic',
size='medium')
tang_bit_width = arb_id.tang.shape[0]
ax.set_xlim([-0.01 * tang_bit_width, 1.005 * tang_bit_width])
y = arb_id.tang[:]
# Differentiate bit positions with non-zero and zero entropy using black circles and grey triangles respectively.
ix = isin(y, 0)
pad_bit = where(ix)
non_pad_bit = where(~ix)
ax.scatter(non_pad_bit, y[non_pad_bit], color='black', marker='o', s=10)
ax.scatter(pad_bit, y[pad_bit], color='grey', marker='^', s=10)
if not path.exists(arb_id_folder):
mkdir(arb_id_folder)
chdir(arb_id_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig(hex(arb_id.id) + "." + figure_format,
bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
chdir("..")
plt.close(fig)
a_timer.set_plot_save_arb_id()
print("\tComplete...")
a_timer.set_plot_save_arb_id_dict()
def plot_signals_by_cluster(a_timer: PipelineTimer,
cluster_dict: dict,
signal_dict: dict,
use_j1979_tags: bool,
force: bool=False):
if path.exists(cluster_folder):
if force:
rmtree(cluster_folder)
else:
print("\nCluster plotting appears to have already been done and forcing is turned off. Skipping...")
return
a_timer.start_function_time()
print("\n")
for cluster_number, list_of_signals in cluster_dict.items():
print("Plotting cluster", cluster_number, "with " + str(len(list_of_signals)) + " signals.")
a_timer.start_iteration_time()
# Setup the plot
fig, axes = plt.subplots(nrows=len(list_of_signals), ncols=1, squeeze=False)
plt.suptitle("Cluster Number " + str(cluster_number),
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + len(list_of_signals)+1) * 1.3)
size_adjust = len(list_of_signals) / 100
# The min() statement provides whitespace for the suptitle depending on the number of subplots.
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
# Plot the time series of each signal in the cluster
for i, signal_key in enumerate(list_of_signals):
signal = signal_dict[signal_key[0]][signal_key]
ax = axes[i, 0]
if signal.j1979_title and use_j1979_tags:
this_title = signal.plot_title + " [" + signal.j1979_title + \
" (PCC:" + str(round(signal.j1979_pcc, 2)) + ")]"
else:
this_title = signal.plot_title
ax.set_title(this_title,
style='italic',
size='medium')
ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
ax.plot(signal.time_series, color='black')
if not path.exists(cluster_folder):
mkdir(cluster_folder)
chdir(cluster_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig("cluster_" + str(cluster_number) + "." + figure_format,
bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
chdir("..")
plt.close(fig)
a_timer.set_plot_save_cluster()
print("\tComplete...")
a_timer.set_plot_save_cluster_dict()
def plot_j1979(a_timer: PipelineTimer, j1979_dict: dict, force: bool=False):
if path.exists(j1979_folder):
if force:
rmtree(j1979_folder)
else:
print("\nJ1979 plotting appears to have already been done and forcing is turned off. Skipping...")
return
a_timer.start_function_time()
print("Plotting J1979 response data")
plot_length = len(j1979_dict.keys())
# Setup the plot
fig, axes = plt.subplots(nrows=plot_length, ncols=1, squeeze=False)
plt.suptitle("J1979 Data",
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + plot_length) * 1.3)
size_adjust = plot_length / 100
# The min() statement provides whitespace for the suptitle depending on the number of subplots.
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
# Plot the time series of each signal in the cluster
for i, (pid, data) in enumerate(j1979_dict.items()):
a_timer.start_iteration_time()
ax = axes[i, 0]
ax.set_title("PID " + str(hex(pid)) + ": " + data.title,
style='italic',
size='medium')
ax.set_xlim([data.data.first_valid_index(), data.data.last_valid_index()])
ax.plot(data.data, color='black')
a_timer.set_plot_save_j1979_pid()
if not path.exists(j1979_folder):
mkdir(j1979_folder)
chdir(j1979_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig("j1979." + figure_format,
bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
chdir("..")
plt.close(fig)
a_timer.set_plot_save_j1979_dict()
print("\tComplete...")

Pipeline/PreProcessor.py Normal file

@@ -0,0 +1,141 @@
from pandas import DataFrame, read_csv, Series
from numpy import int64
from os import path, remove
from pickle import load
from typing import Callable
from ArbID import ArbID
from J1979 import J1979
from PipelineTimer import PipelineTimer
class PreProcessor:
def __init__(self, data_filename: str, id_output_filename: str, j1979_output_filename: str):
self.data_filename: str = data_filename
self.id_output_filename: str = id_output_filename
self.j1979_output_filename: str = j1979_output_filename
self.data: DataFrame = None
self.import_time: float = 0.0
self.dictionary_time: float = 0.0
self.total_time: float = 0.0
def import_csv(self, a_timer: PipelineTimer, filename):
def hex2int(x):
if x == '':
return 0
return int(x, 16)
def fix_time(x):
try:
return float(str(x)[:-1])
except ValueError:
# There may have been a newline the capture device was trying to write when turned off.
return None
# Used by pd.read_csv to apply the functions to the respective column vectors in the .csv file
convert_dict = {'time': fix_time, 'id': hex2int, 'dlc': hex2int, 'b0': hex2int, 'b1': hex2int, 'b2': hex2int,
'b3': hex2int, 'b4': hex2int, 'b5': hex2int, 'b6': hex2int, 'b7': hex2int}
print("\nReading in " + self.data_filename + "...")
a_timer.start_function_time()
self.data = read_csv(filename,
header=None,
names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
skiprows=7,
delimiter='\t',
converters=convert_dict,
index_col=0)
a_timer.set_can_csv_to_df()
# sanity check output of the original data
# print("\nSample of the original data:")
# print(self.data.head(5), "\n")
@staticmethod
def generate_j1979_dictionary(j1979_data: DataFrame) -> dict:
d = {}
services = j1979_data.groupby('b2')
for uds_pid, data in services:
d[uds_pid] = J1979(uds_pid, data)
return d
def generate_arb_id_dictionary(self,
a_timer: PipelineTimer,
normalize_strategy: Callable,
time_conversion: int = 1000,
freq_analysis_accuracy: float = 0.0,
freq_synchronous_threshold: float = 0.0,
force: bool = False) -> (dict, dict):
if path.isfile(self.id_output_filename):
if force:
# Remove any existing pickled Arb ID dictionary and create one based on this data.
remove(self.id_output_filename)
remove(self.j1979_output_filename)
self.import_csv(a_timer, self.data_filename)
else:
arb_id_dict = load(open(self.id_output_filename, "rb"))
j1979_dict = load(open(self.j1979_output_filename, "rb"))
return arb_id_dict, j1979_dict
else:
self.import_csv(a_timer, self.data_filename)
id_dictionary = {}
j1979_dictionary = {}
a_timer.start_function_time()
for arb_id in Series.unique(self.data['id']):
if isinstance(arb_id, int64):
if arb_id == 2015:
# These are the J1979 requests (if any) (ID 0x7DF = 2015). Just ignore them.
continue
elif arb_id == 2024:
# These are the J1979 responses (ID 0x7DF + 0x8 = 0x7E8 = 2024)
j1979_data = self.data.loc[self.data['id'] == arb_id].copy()
j1979_data.drop('dlc', axis=1, inplace=True)
j1979_data.drop('id', axis=1, inplace=True)
a_timer.start_nested_function_time()
j1979_dictionary = self.generate_j1979_dictionary(j1979_data)
a_timer.set_j1979_creation()
elif arb_id > 0:
a_timer.start_iteration_time()
this_id = ArbID(arb_id)
this_id.original_data = self.data.loc[self.data['id'] == arb_id]
this_id.original_data = this_id.original_data.copy() # type: DataFrame
# Check if the Arbitration ID always used the same DLC. If not, ignore it.
# We can effectively ignore this Arb ID by not adding it to the Arb ID dictionary.
if this_id.original_data['dlc'].nunique() != 1:
continue
this_id.dlc = this_id.original_data['dlc'].iloc[0]
this_id.original_data.drop('dlc', axis=1, inplace=True)
this_id.original_data.drop('id', axis=1, inplace=True)
# If DLC < 8, we can automatically drop data column vectors > DLC.
# E.G. drop bytes "B7" and "B6" if DLC is 6; those are padding data injected by can-dump and were
# not actually on the bus.
if this_id.dlc < 8:
for i in range(this_id.dlc, 8):
this_id.original_data.drop('b' + str(i), axis=1, inplace=True)
# Check if there are duplicate index values and correct them.
if not this_id.original_data.index.is_unique:
correction_mask = this_id.original_data.index.duplicated()
this_id.original_data = this_id.original_data[~correction_mask]
this_id.generate_binary_matrix_and_tang(a_timer, normalize_strategy)
this_id.analyze_transmission_frequency(time_convert=time_conversion,
ci_accuracy=freq_analysis_accuracy,
synchronous_threshold=freq_synchronous_threshold)
id_dictionary[arb_id] = this_id
a_timer.set_arb_id_creation()
a_timer.set_raw_df_to_arb_id_dict()
return id_dictionary, j1979_dictionary

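An illustration of the converter functions defined inside import_csv, applied to hypothetical field values; the exact column layout of the logger output (including which trailing character the time field carries) is an assumption here.

def hex2int(x):
    return 0 if x == '' else int(x, 16)

def fix_time(x):
    try:
        return float(str(x)[:-1])   # drops one trailing character from the time field
    except ValueError:
        return None

print(hex2int('7df'))       # 2015, the J1979 request Arb ID
print(hex2int(''))          # 0; empty payload bytes become zero
print(fix_time('12.345)'))  # 12.345, assuming a trailing ')' in the time column
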
Pipeline/SemanticAnalysis.py Normal file

@@ -0,0 +1,336 @@
from pandas import concat, DataFrame, read_csv
from numpy import zeros
from os import path, remove
from pickle import load
from ast import literal_eval
from J1979 import J1979
from Signal import Signal
from PipelineTimer import PipelineTimer
def subset_selection(a_timer: PipelineTimer,
signal_dict: dict = None,
subset_pickle: str = "",
force: bool = False,
subset_size: float = 0.25) -> DataFrame:
if path.isfile(subset_pickle):
if force:
# Remove any existing pickled Signal dictionary and create one.
remove(subset_pickle)
else:
print("\nSubset selection already completed and forcing is turned off. Using pickled data...")
return load(open(subset_pickle, "rb"))
a_timer.start_function_time()
signal_index = 0
for k_arb_id, arb_id_signals in signal_dict.items():
for k_signal_id, signal in arb_id_signals.items():
if not signal.static:
signal_index += 1
# setup subset selection data structure
df: DataFrame = DataFrame(zeros((signal_index, 4)),
columns=["arb_id", "start_index", "stop_index", "Shannon_Index"])
for i, (k_arb_id, arb_id_signals) in enumerate(signal_dict.items()):
for j, (k_signal_id, signal) in enumerate(arb_id_signals.items()):
if not signal.static:
df.iloc[signal_index-1] = [k_arb_id, signal.start_index, signal.stop_index, signal.shannon_index]
signal_index -= 1
# sort by Shannon Index
df.sort_values(by="Shannon_Index", inplace=True, ascending=False)
# Select subset with largest Shannon Index Values
df = df.head(int(round(df.__len__() * subset_size, 0)))
# In order to make an arb ID sorted output, sort this subset by arb_id
df.sort_values(by="arb_id", inplace=True)
# Re-index each Signal in the subset using the Signal with the most observed samples. Prepare to create a DataFrame
# that can be used for generating a correlation matrix.
subset = []
subset_cols = []
largest_index = []
for index, row in df.iterrows():
signal_id = (int(row[0]), int(row[1]), int(row[2]))
signal = signal_dict[row[0]][signal_id]
subset.append(signal)
subset_cols.append(signal_id)
if signal.time_series.__len__() > largest_index.__len__():
largest_index = signal.time_series.index
subset_df: DataFrame = DataFrame(zeros((largest_index.__len__(), subset.__len__())),
columns=subset_cols,
index=largest_index)
for signal in subset:
signal_id = (signal.arb_id, signal.start_index, signal.stop_index)
subset_df[signal_id] = signal.time_series.reindex(index=largest_index, method='nearest')
a_timer.set_subset_selection()
return subset_df
def subset_correlation(subset: DataFrame,
csv_correlation_filename: str,
force: bool = False) -> DataFrame:
if not force and path.isfile(csv_correlation_filename):
print("\nA subset correlation appears to exist and forcing is turned off. Using " + csv_correlation_filename)
# Read the .csv into a DataFrame. Also, we need to convert the columns and index from strings back to tuples.
# Pandas.read_csv brings the data in as a DataFrame. Pandas.DataFrame.rename converts the columns and index with
# ast.literal_eval. Literal_eval will convert a string representation of a tuple back to an actual tuple.
return read_csv(csv_correlation_filename, index_col=0).rename(index=literal_eval, columns=literal_eval)
else:
return subset.corr()
def greedy_signal_clustering(correlation_matrix: DataFrame = None,
correlation_threshold: float = 0.8,
fuzzy_labeling: bool = True) -> dict:
correlation_keys = correlation_matrix.columns.values
previously_clustered_signals = {}
cluster_dict = {}
new_cluster_label = 0
for n, row in enumerate(correlation_keys):
for m, col in enumerate(correlation_keys):
if n == m:
# this is a diagonal on the correlation matrix. Skip it.
continue
# I chose to round here to allow relationships 'oh so close' to making it. No reason this HAS to be done.
result = round(correlation_matrix.iloc[n, m], 2)
# check if this is a significant correlation according to our heuristic threshold.
if result >= correlation_threshold:
# Check if the current row signal is currently unlabeled
if row not in previously_clustered_signals.keys():
# Check if the current col signal is currently unlabeled
if col not in previously_clustered_signals.keys():
# Both signals are unlabeled. Create a new one.
cluster_dict[new_cluster_label] = [row, col]
previously_clustered_signals[row] = {new_cluster_label}
previously_clustered_signals[col] = {new_cluster_label}
# print("created new cluster #", new_cluster_label, cluster_dict[new_cluster_label])
new_cluster_label += 1
else:
# Row isn't labeled but col is; add row to all of col's clusters.
# print("adding", row, "to clusters", previously_clustered_signals[col])
# row is not already in a cluster, add it to col's set of clusters
for label in previously_clustered_signals[col]:
cluster_dict[label].append(row)
previously_clustered_signals[row] = previously_clustered_signals[col]
else:
# Check if the current col signal is currently unlabeled
if col not in previously_clustered_signals.keys():
# Row is labeled but col is not; add col to row's set of clusters
# print("adding", col, "to clusters", previously_clustered_signals[row])
for label in previously_clustered_signals[row]:
cluster_dict[label].append(col)
previously_clustered_signals[col] = previously_clustered_signals[row]
# Both signals are already labeled
else:
# Check if we're using fuzzy labeling (a signal can belong to multiple clusters).
# If so, check whether the intersection of both sets of labels is empty. If it is, this is a
# relationship that hasn't already been captured by an existing cluster. Make a new one.
if fuzzy_labeling:
row_label_set = previously_clustered_signals[row]
col_label_set = previously_clustered_signals[col]
if not row_label_set & col_label_set:
cluster_dict[new_cluster_label] = [row, col]
previously_clustered_signals[row] = {new_cluster_label} | row_label_set
previously_clustered_signals[col] = {new_cluster_label} | col_label_set
# print("created new cluster #", new_cluster_label, cluster_dict[new_cluster_label])
new_cluster_label += 1
else:
# We're using fuzzy labeling and these two signals represent a 'bridge' between two
# signal clusters. Fold col into row's clusters and delete col's unique cluster indices.
for label in row_label_set - col_label_set:
cluster_dict[label].append(col)
previously_clustered_signals[col] = row_label_set | col_label_set
for label in col_label_set - row_label_set:
cluster_dict[label].append(row)
previously_clustered_signals[row] = row_label_set | col_label_set
# print(row, col, "already in cluster_dict", previously_clustered_signals[row], "&",
# previously_clustered_signals[col])
# Delete any duplicate clusters
cluster_sets = []
deletion_labels = []
for label, cluster in cluster_dict.items():
this_set = set(cluster)
if this_set in cluster_sets:
deletion_labels.append(label)
else:
cluster_sets.append(this_set)
for label in deletion_labels:
del cluster_dict[label]
return cluster_dict
# NOTE: This method has a LOT of redundancy with the subset_selection, signal_correlation, and greedy_signal_clustering
# logic. If you know you always want to perform label propagation, it would be more efficient to incorporate it directly
# into those functions. Since this code base is more of a Proof of Concept, label propagation is deliberately pulled
# out as a distinct method to make the pipeline steps as distinct as possible.
def label_propagation(a_timer: PipelineTimer,
pickle_clusters_filename: str = '',
pickle_all_signals_df_filename: str = '',
csv_signals_correlation_filename: str = '',
signal_dict: dict = None,
cluster_dict: dict = None,
correlation_threshold: float = 0.8,
force: bool = False):
if path.isfile(pickle_all_signals_df_filename) and path.isfile(csv_signals_correlation_filename):
if force:
# Remove any existing data.
remove(pickle_all_signals_df_filename)
remove(csv_signals_correlation_filename)
remove(pickle_clusters_filename)
else:
print("\nA DataFrame and correlation matrix for label propagation appears to exist and forcing is turned "
"off. Using " + pickle_all_signals_df_filename + ", " + csv_signals_correlation_filename + ", and "
+ pickle_clusters_filename)
return [load(open(pickle_all_signals_df_filename, "rb")),
read_csv(csv_signals_correlation_filename, index_col=0).rename(index=literal_eval,
columns=literal_eval),
load(open(pickle_clusters_filename, "rb"))]
a_timer.start_function_time()
non_static_signals_dict = {}
largest_index = []
df_columns = []
# Put all non-static signals into one DataFrame. Re-index all of them to share the same index.
for k_arb_id, arb_id_signals in signal_dict.items():
for k_signal_id, signal in arb_id_signals.items():
if not signal.static:
non_static_signals_dict[k_signal_id] = signal
df_columns.append(k_signal_id)
if signal.time_series.__len__() > largest_index.__len__():
largest_index = signal.time_series.index
df: DataFrame = DataFrame(zeros((largest_index.__len__(), df_columns.__len__())),
columns=df_columns,
index=largest_index)
for k_signal_id, signal in non_static_signals_dict.items():
df[k_signal_id] = signal.time_series.reindex(index=largest_index, method='nearest')
# Calculate the correlation matrix for this DataFrame of all non-static signals.
correlation_matrix = df.corr()
# Re-run the algorithm from greedy_signal_clustering but omitting the logic for creating new clusters.
# This effectively propagates the labels generated by the subset of signals with the largest Shannon Index values
# to any correlated signals which were not part of that subset.
correlation_keys = correlation_matrix.columns.values
previously_clustered_signals = {}
for k_cluster_id, cluster in cluster_dict.items():
for k_signal_id in cluster:
previously_clustered_signals[k_signal_id] = k_cluster_id
for n, row in enumerate(correlation_keys):
for m, col in enumerate(correlation_keys):
if n == m:
# this is a diagonal on the correlation matrix. Skip it.
continue
# I chose to round here to allow relationships 'oh so close' to making it. No reason this HAS to be done.
result = round(correlation_matrix.iloc[n, m], 2)
# check if this is a significant correlation according to our heuristic threshold.
if result >= correlation_threshold:
# if row signal is already a member of a cluster
if row in previously_clustered_signals.keys():
# if col signal is already a member of a cluster
if col in previously_clustered_signals.keys():
# print(row, col, "already in clusters", previously_clustered_signals[row], "&",
# previously_clustered_signals[col])
continue
# if col is not already in a cluster, add it to row's cluster
else:
# print("adding", col, "to cluster", clusters[previously_clustered_signals[row]])
cluster_dict[previously_clustered_signals[row]].append(col)
previously_clustered_signals[col] = previously_clustered_signals[row]
# row signal hasn't been added to a cluster
else:
# if col signal is already a member of a cluster
if col in previously_clustered_signals.keys():
# print("adding", row, "to cluster", clusters[previously_clustered_signals[col]])
# row is not already in a cluster, add it to col's cluster
cluster_dict[previously_clustered_signals[col]].append(row)
previously_clustered_signals[row] = previously_clustered_signals[col]
a_timer.set_label_propagation()
df.dropna(axis=0, how='any', inplace=True)
df.dropna(axis=1, how='any', inplace=True)
return df, correlation_matrix, cluster_dict
def j1979_signal_labeling(a_timer: PipelineTimer,
j1979_corr_filename: str = "",
df_signals: DataFrame = None,
j1979_dict: dict = None,
signal_dict: dict = None,
correlation_threshold: float = 0.8,
force: bool = False):
if path.isfile(j1979_corr_filename):
if force:
# Remove any existing data.
remove(j1979_corr_filename)
else:
print("\nA J1979 correlation matrix for signal labeling appears to exist and forcing is turned off. Using "
+ j1979_corr_filename)
return signal_dict, load(open(j1979_corr_filename, "rb"))
latest_start_index = 0.0
earliest_end_index = 99999999999999.9
df_columns = []
for pid, pid_data in j1979_dict.items(): # type: int, J1979
if latest_start_index < pid_data.data.index[0]:
latest_start_index = pid_data.data.index[0]
if earliest_end_index > pid_data.data.index[-1]:
earliest_end_index = pid_data.data.index[-1]
df_columns.append(pid_data.title)
df_signals = df_signals.loc[latest_start_index:earliest_end_index] # type: DataFrame
df_j1979: DataFrame = DataFrame(zeros((df_signals.shape[0], df_columns.__len__())),
columns=df_columns,
index=df_signals.index)
for pid, pid_data in j1979_dict.items(): # type: int, J1979
df_j1979[pid_data.title] = pid_data.data.reindex(index=df_signals.index, method='nearest')
df_combined = concat([df_signals, df_j1979], axis=1)
correlation_matrix = df_combined.corr()
correlation_matrix.dropna(axis=1, how='all', inplace=True)
correlation_matrix.dropna(axis=0, how='all', inplace=True)
# Just consider the J1979 column correlations. Slice off the identity rows at the end of the DataFrame as well.
for index, row in correlation_matrix[df_columns][:-len(df_columns)].iterrows():
row = abs(row)
max_index = row.idxmax(skipna=True)
if row[max_index] >= correlation_threshold:
signal = signal_dict[index[0]][index] # type: Signal
# print("Adding J1979 attribute " + max_index + " to signal ID " + str(index))
signal.j1979_title = max_index
signal.j1979_pcc = row[max_index]
signal_dict[index[0]][index] = signal
# print(i, index, row[max_index], max_index, row.values)
return signal_dict, correlation_matrix[df_columns][:-len(df_columns)]
# correlation_matrix.to_csv('j1979_correlation.csv')

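A usage sketch of greedy_signal_clustering on a hand-made 3x3 correlation matrix; the signal keys and correlation values are invented, and the module name follows the imports in Main.py.

from pandas import DataFrame
from SemanticAnalysis import greedy_signal_clustering

keys = [(272, 0, 7), (272, 8, 15), (554, 0, 11)]   # (arb_id, start_bit, stop_bit)
corr = DataFrame([[1.00, 0.95, 0.20],
                  [0.95, 1.00, 0.10],
                  [0.20, 0.10, 1.00]], index=keys, columns=keys)
clusters = greedy_signal_clustering(corr, correlation_threshold=0.85, fuzzy_labeling=True)
print(clusters)   # {0: [(272, 0, 7), (272, 8, 15)]} with these toy values
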
Pipeline/Signal.py Normal file

@@ -0,0 +1,43 @@
from pandas import Series
from math import log10
class Signal:
def __init__(self, arb_id: int, start_index: int, stop_index: int):
self.arb_id: int = arb_id
self.start_index: int = start_index
self.stop_index: int = stop_index
self.time_series: Series = None
self.static: bool = True
self.shannon_index: float = 0
self.plot_title: str = ""
self.j1979_title: str = None
self.j1979_pcc: float = 0
def normalize_and_set_metadata(self, normalize_strategy):
self.set_shannon_index()
self.update_static()
self.set_plot_title()
self.normalize(normalize_strategy)
def set_shannon_index(self):
si: float = 0.0
n: int = self.time_series.__len__()
for count in self.time_series.value_counts():
# calculate proportion of this integer value in the total population of values
p_i = count / n
# calculate the Shannon Index (si) given p_i of this value.
si += p_i * log10(p_i)
si *= -1
self.shannon_index = si
def update_static(self):
if self.shannon_index >= .000001:
self.static = False
def set_plot_title(self):
self.plot_title = "Time Series from Bit Positions " + str(self.start_index) + " to " + str(self.stop_index) + \
" of Arb ID " + hex(int(self.arb_id))
def normalize(self, normalize_strategy):
normalize_strategy(self.time_series.values, copy=False)

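A worked example of the Shannon index computed in set_shannon_index, using a tiny made-up time series with three distinct values.

from math import log10
from pandas import Series

values = Series([1, 1, 1, 1, 2, 2, 3, 3])
n = len(values)
si = -sum((c / n) * log10(c / n) for c in values.value_counts())
print(round(si, 3))   # 0.452 -> non-zero, so update_static() would mark the signal non-static
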
View File

@@ -0,0 +1,97 @@
from typing import Callable, List
from pandas import DataFrame
from numpy import float64, logical_xor, mean, ndarray, sqrt, std, sum, uint8, zeros
from PipelineTimer import PipelineTimer
# noinspection PyArgumentList
class ArbID:
def __init__(self, arb_id: int):
self.id: int = arb_id
# These features are set by PreProcessing.py's generate_arb_id_dictionary
self.dlc: int = 0
self.original_data: DataFrame = None
# These features are set in generate_binary_matrix_and_tang called by generate_arb_id_dictionary
self.boolean_matrix: ndarray = None
self.tang: ndarray = None
# Static and short are just bookkeeping flags to let other methods know this Arb ID probably isn't worth analyzing
self.static: bool = True
self.short: bool = True
# These features are set in analyze_transmission_frequency called by generate_arb_id_dictionary
self.ci_sensitivity: float = 0.0
self.freq_mean: float = 0.0
self.freq_std: float = 0.0
self.freq_ci: tuple = None
self.mean_to_ci_ratio: float = 0.0
self.synchronous: bool = False
# These features are set by LexicalAnalysis.py's get_composition
self.tokenization: List[tuple] = []
self.padding: List[int] = []
@staticmethod
def generate_tang(boolean_matrix):
transition_matrix = logical_xor(boolean_matrix[:-1, ], boolean_matrix[1:, ])
return sum(transition_matrix, axis=0, dtype=float64)
def generate_binary_matrix_and_tang(self, a_timer: PipelineTimer, normalize_strategy: Callable):
a_timer.start_nested_function_time()
self.boolean_matrix = zeros((self.original_data.__len__(), self.dlc * 8), dtype=uint8)
for i, row in enumerate(self.original_data.itertuples()):
for j, cell in enumerate(row[1:]):
# Skip cells that were already 0
if cell > 0:
# i is the row in the boolean_matrix
# j*8 is the left hand bit for this byte in the payload
# j*8 + 8 is the right hand bit for this byte in the payload
# e.g. byte index 1 starts at bits 1*8 = 8 to 1*8+8 = 16; [8:16]
# likewise, byte index 7 starts at bits 7*8 = 56 to 7*8+8 = 64
# Numpy indexing is non-inclusive of the upper bound. So [0:8] is the first 8 elements
bin_string = format(cell, '08b')
self.boolean_matrix[i, j * 8:j * 8 + 8] = [x == '1' for x in bin_string]
a_timer.set_hex_to_bool_matrix()
if self.boolean_matrix.shape[0] > 1:
a_timer.start_nested_function_time()
self.tang = self.generate_tang(boolean_matrix=self.boolean_matrix)
# Ensure there are no divide-by-zero issues caused by an all-zero TANG vector
if max(self.tang) > 0:
# TODO: This conditional path should account for there only being one value in all the signals.
# see Plotter.py plot_signals_by_arb_id() for how this crashes plotting.
normalize_strategy(self.tang, axis=0, copy=False)
self.static = False
if self.original_data.shape[0] > 4:
self.short = False
a_timer.set_bool_matrix_to_tang()
def analyze_transmission_frequency(self,
time_convert: int = 1000,
ci_accuracy: float = 1.645,
synchronous_threshold: float = 0.1):
if self.short or self.original_data.__len__() < 4:
return
self.ci_sensitivity = ci_accuracy
# time_convert = 1000 is intended to convert seconds to milliseconds.
freq_intervals = self.original_data.index[1:] - self.original_data.index[:-1]
self.freq_mean = mean(freq_intervals) * time_convert
self.freq_std = std(freq_intervals, ddof=1)*time_convert
# Assumes distribution of freq_intervals is gaussian normal.
mean_offset = ci_accuracy * self.freq_std / sqrt(len(freq_intervals))
self.freq_ci = (self.freq_mean - mean_offset, self.freq_mean + mean_offset)
        # mean_to_ci_ratio is the ratio of the CI range to the mean value. This provides a heuristic to determine
# if this Arb ID can be called 'synchronous' given the time scale of its average transmission frequency.
# For example, imagine an Arb ID transmitting with a mean frequency of exactly one second. Its 90% Confidence
# Interval spans 50 milliseconds about that ideal mean. It's reasonable to assume this Arb ID was engineered to
# be a 'synchronous' signal. However, a different Arb ID with a mean frequency of 40 milliseconds and the same
# confidence interval could not be reasonably assumed to have been engineered to be 'synchronous'. This other
# Arb ID was engineered to be 'high frequency' compared to the first example, but there isn't evidence that it's
# intended to be a high frequency 'synchronous' process that adheres to a certain clock frequency.
# This inference relies upon the assumption that the OEM didn't engineer the bus to be a train wreck with some
# IDs losing an arbitrarily large percentage of their arbitration phases.
self.mean_to_ci_ratio = 2*mean_offset/self.freq_mean
if self.mean_to_ci_ratio <= synchronous_threshold:
self.synchronous = True
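
# A minimal, illustrative sketch of the heuristic above using purely hypothetical numbers:
# a 1000 ms mean gap, a 60.8 ms sample standard deviation, and 100 observed gaps give a
# 90% CI half-width of roughly 10 ms, so the CI-to-mean ratio is ~0.02 and this Arb ID
# would be flagged synchronous at the default 0.1 threshold.
if __name__ == '__main__':
    freq_mean, freq_std, n_gaps = 1000.0, 60.8, 100        # hypothetical values (ms)
    mean_offset = 1.645 * freq_std / sqrt(n_gaps)          # ~10.0 ms CI half-width
    print(2 * mean_offset / freq_mean <= 0.1)              # True -> 'synchronous'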

View File

@ -0,0 +1,76 @@
from os import chdir, path, getcwd, walk
import re
from Sample import Sample
can_data_filename: str = ''
class FileBoi:
def __init__(self):
self.sample_dict = self.go_fetch()
# path.exists(output_folder):
# mkdir(output_folder)
# chdir(output_folder)
@staticmethod
def go_fetch(kfold_n: int = 5):
# NOTE: File structure relative to this script is expected to be as follows.
# .
# +-- Captures
# | +-- Make x_0
# | | +-- Model y_0
# | | | +-- ModelYear z_0
# | | | | +-- Samples
# | | | | | +-- sample = re.match('loggerProgram[\d]+.log', a_file_in_this_folder)
# | +-- Make x_1.... etc.
# +-- Some folder
# | +-- The directory with these scripts
# | | +-- this_script.py
script_dir: str = getcwd()
chdir("../../")
if not path.exists("Captures"):
# Make sure your local directory structure matches the example above. If not... adjust accordingly
print("Error finding Captures folder. Please check the relative path between this script and Captures.")
print("See the source of go_fetch() in FileBoi.py for an example of the expected relative paths.")
chdir(script_dir)
quit()
chdir("Captures")
root_dir = getcwd()
make: str = ""
model: str = ""
year: str = ""
current_vehicle = []
sample_dict = {}
for dirName, subdirList, fileList in walk(root_dir, topdown=True):
this_dir = path.basename(dirName)
if len(subdirList) == 0:
if len(current_vehicle) == 3:
make = current_vehicle[0]
model = current_vehicle[1]
year = current_vehicle[2]
elif len(current_vehicle) == 2:
model = current_vehicle[0]
year = current_vehicle[1]
                elif len(current_vehicle) == 1 and current_vehicle[0] != "":
year = current_vehicle[0]
for file in fileList:
# Check if this file name matches the expected name for a CAN data sample. If so, create new Sample
                    m = re.match(r'loggerProgram[\d]+\.log', file)
if m:
                        if (make, model, year) not in sample_dict:
sample_dict[(make, model, year)] = []
this_sample_index = str(len(sample_dict[(make, model, year)]))
this_sample = Sample(make=make, model=model, year=year, sample_index=this_sample_index,
                                             sample_path=path.join(dirName, m.group(0)), kfold_n=kfold_n)
sample_dict[(make, model, year)].append(this_sample)
current_vehicle = []
else:
if this_dir == "Captures":
continue
current_vehicle.append(this_dir)
chdir(script_dir)
return sample_dict
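
# Minimal usage sketch: the dictionary built in __init__ (and returned by go_fetch) is keyed
# by (make, model, year) tuples, each mapping to a list of Sample objects found under the
# Captures tree described above. Output depends entirely on the local directory contents.
if __name__ == '__main__':
    for vehicle, sample_list in FileBoi().sample_dict.items():
        print(vehicle, "->", len(sample_list), "sample(s)")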

View File

@ -0,0 +1,97 @@
from pandas import DataFrame, Series
from numpy import int8, float16, uint8, uint16
class J1979:
def __init__(self, pid: int, original_data: DataFrame):
self.pid: int = pid
self.title: str = ""
self.data: Series = self.process_response_data(original_data)
print("Found " + str(self.data.shape[0]) + " responses for J1979 PID " + str(hex(self.pid)) + ":", self.title)
def process_response_data(self, original_data) -> Series:
# ISO-TP formatted Universal Diagnostic Service (UDS) requests that were sent by the CAN collection device
# during sampling. Request made using Arb ID 0x7DF with DLC of 8. Response should use Arb ID 7E8 (0x7DF + 0x8).
# DataFrame Columns: b0 b1 b2 b3 ... b7
# -- -- -- -- --
# PID 0x0C (12 dec) (Engine RPM): 02 01 0c 00 ... 00
# PID 0x0D (13 dec) (Vehicle Speed): 02 01 0d 00 ... 00
        # PID 0x11 (17 dec) (Throttle Pos.):   02   01   11   00  ...  00
# PID 0x61 (97 dec) (Demand % Torque): 02 01 61 00 ... 00
# PID 0x62 (98 dec) (Engine % Torque): 02 01 62 00 ... 00
# PID 0x63 (99 dec) (Ref. Torque): 02 01 63 00 ... 00
# PID 0x8e (142 dec) (Friction Torque): 02 01 8e 00 ... 00
# Responses being managed here should follow the ISO-TP + UDS per-byte format AA BB CC DD .. DD
# BYTE: AA BB CC DD ... DD
# USE: response size (bytes) UDS mode + 0x40 UDS PID response data
# DF COLUMN: b0 b1 b2 b3 ... b7
# Remember that this response data is already converted to decimal. Thus, byte BB = 65 = 0x41 = 0x01 + 0x40.
        # If BB isn't 0x41, check what the error code is. Some error codes are listed in the UDS chapter of the car
# hacker's handbook available at http://opengarages.org/handbook/ebook/.
if self.pid == 12:
self.title = 'Engine RPM'
            # PID is 0x0C: Engine RPM. 2 bytes of data AA BB converted using 1/4 RPM per bit: (256*AA+BB)/4
# Min value: 0 Max value: 16,383.75 units: rpm
return Series(data=(256*original_data['b3']+original_data['b4'])/4,
index=original_data.index,
name=self.title,
dtype=float16)
elif self.pid == 13:
self.title = 'Speed km/h'
# PID is 0x0D: Vehicle Speed. 1 byte of data AA using 1km/h per bit: no conversion necessary
# Min value: 0 Max value: 255 (158.44965mph) units: km/h
return Series(data=original_data['b3'],
index=original_data.index,
name=self.title,
dtype=uint8)
elif self.pid == 17:
self.title = 'Throttle %'
# PID is 0x11: Throttle Position. 1 byte of data AA using 100/255 % per bit: AA * 100/255% throttle.
# Min value: 0 Max value: 100 units: %
return Series(data=100 * original_data['b3'] / 255,
index=original_data.index,
name=self.title,
dtype=uint8)
elif self.pid == 97:
self.title = 'Demand Torque %'
# PID is 0x61: Driver's demand engine - percent torque. 1 byte of data AA using 1%/bit with -125 offset
# AA - 125
# Min value: -125 Max value: 130 units: %
return Series(data=original_data['b3'] - 125,
index=original_data.index,
name=self.title,
dtype=int8)
elif self.pid == 98:
self.title = 'Actual Torque %'
# PID is 0x62: Actual engine - percent torque. 1 byte of data AA using 1%/bit with -125 offset
# AA - 125
# Min value: -125 Max value: 130 units: %
return Series(data=original_data['b3'] - 125,
index=original_data.index,
name=self.title,
dtype=int8)
elif self.pid == 99:
self.title = 'Reference Torque Nm'
            # PID is 0x63: Engine reference torque. 2 bytes of data AA BB using 1 Nm/bit: 256*AA + BB Nm torque
# Min value: 0 Max value: 65,535 units: Nm
return Series(data=256*original_data['b3'] + original_data['b4'],
index=original_data.index,
name=self.title,
dtype=uint16)
elif self.pid == 142:
self.title = 'Engine Friction Torque %'
# PID is 0x8E: Engine Friction - Percent Torque. 1 byte of data AA using 1%/bit with -125 offset. AA - 125
# Min value: -125 Max value: 130 units: %
return Series(data=original_data['b3'] - 125,
index=original_data.index,
name=self.title,
dtype=int8)
else:
# Looks like you were requesting J1979 data with your sniffer that hasn't been implemented in this code.
# Time to do the leg work to expand this class accordingly then re-run the pipeline.
raise ValueError("Encountered J1979 PID " + str(hex(self.pid)) +
" in Pre-Processing that hasn't been programmed. Expand the J1979 class to handle all "
"J1979 requests made during data collection.")

View File

@ -0,0 +1,189 @@
from numpy import float64, nditer, uint64, zeros, ndarray
from pandas import Series
from os import path, remove
from pickle import load
from ArbID import ArbID
from Signal import Signal
from PipelineTimer import PipelineTimer
from typing import List
def tokenize_dictionary(a_timer: PipelineTimer,
d: dict,
force: bool = False,
include_padding: bool = False,
merge: bool = True,
                        max_distance: float = 0.1):
a_timer.start_function_time()
for k, arb_id in d.items():
if not arb_id.static:
if arb_id.padding and not force:
print("\nTokenization already completed and forcing is turned off. Skipping...")
return
a_timer.start_iteration_time()
get_composition(arb_id, include_padding, max_distance)
a_timer.set_tang_to_composition()
if merge:
a_timer.start_iteration_time()
merge_tokens(arb_id, max_distance)
a_timer.set_composition_merge()
a_timer.set_tokenization()
# This is a greedy algorithm to cluster bit positions in a series of CAN payloads suspected of being part of a
# continuous numerical time series.
def get_composition_just_tang(this_tang: ndarray, include_padding=False, max_inversion_distance: float = 0.0):
tokens: List[tuple] = []
padding = []
start_index = 0
currently_clustering = False
big_endian = True
last_bit_position = 0
# Consider each element in the TANG. The TANG is an ndarray with index being bit position from the
# original CAN data. The cell value is the observed transition frequency for that bit position.
for i, bit_position in enumerate(nditer(this_tang)):
# Is this a padding bit?
if bit_position <= 0.000001:
padding.append(i)
# Are we clustering padding bits? If so, proceed to the normal clustering logic. Else, do the following.
if not include_padding:
if currently_clustering:
# This is padding, we're already clustering, and we're not clustering padding; save the token.
tokens.append((start_index, i - 1))
currently_clustering = False
start_index = i + 1
last_bit_position = bit_position
continue
# Are we still enlarging the current token?
if currently_clustering:
if bit_position >= last_bit_position and big_endian:
pass
elif bit_position <= last_bit_position and not big_endian:
pass
# Are we allowing inversions (max_inversion_distance > 0)? If so, check if this inversion is acceptable.
elif abs(bit_position - last_bit_position) <= max_inversion_distance:
pass
# Is this the second bit position we need to establish the endian of the signal?
elif start_index == i - 1:
if bit_position >= last_bit_position:
big_endian = True
else:
big_endian = False
# This is an unacceptable transition frequency inversion, save the current token and start a new one
else:
tokens.append((start_index, i - 1))
start_index = i
# We aren't currently clustering and we intend to cluster this bit position
else:
currently_clustering = True
start_index = i
last_bit_position = bit_position
# We reached the last bit position while clustering. Add this final token.
if currently_clustering:
tokens.append((start_index, this_tang.__len__() - 1))
return tokens, padding
def get_composition(arb_id: ArbID, include_padding=False, max_inversion_distance: float = 0.0):
arb_id.tokenization, arb_id.padding = \
get_composition_just_tang(arb_id.tang, include_padding, max_inversion_distance)
def merge_tokens_just_composition(tokens: list, this_tang, max_distance: float):
verbose = False
if tokens.__len__() < 2:
        # Make sure there are multiple tokens to merge
pass
else:
# Editing data structures while iterating over them is a bad idea in Python
        # Instead, let's keep track of tokens we want to delete using remove_list.
remove_list = []
last = None
for i, token in enumerate(tokens):
if verbose:
print("Last:", last, "\tCurrent:", token)
if last:
# Are these tokens adjacent?
if last[1] + 1 == token[0]:
if verbose:
print("\tAdjacent with distance of", abs(this_tang[last[1]] - this_tang[token[0]]))
# Is the transition frequency of the adjacent bit positions less than the max distance threshold?
if abs(this_tang[last[1]] - this_tang[token[0]]) <= max_distance:
remove_list.append(last)
token = (last[0], token[1])
tokens[i] = token
if verbose:
print("\t\tMerged into", token)
last = token
if remove_list:
for token in remove_list:
tokens.remove(token)
if verbose:
print("final tokenization", tokens)
return tokens
def merge_tokens(arb_id: ArbID, max_distance):
# if arb_id.id == 292: # make this equal to the decimal value of an Arb ID in the data you want to see get merged
# verbose = True
# else:
verbose = False
if verbose:
print("\nENTERING MERGE PHASE OF ARB ID", arb_id.id)
print("STARTING TOKENS:", arb_id.tokenization)
arb_id.tokenization = merge_tokens_just_composition(arb_id.tokenization, arb_id.tang, max_distance)
# noinspection PyTypeChecker
def generate_signals(a_timer: PipelineTimer,
arb_id_dict: dict,
signal_pickle_filename: str,
normalize_strategy,
force=False):
if force and path.isfile(signal_pickle_filename):
remove(signal_pickle_filename)
if path.isfile(signal_pickle_filename):
print("\nSignal generation already completed and forcing is turned off. Using pickled data...")
return load(open(signal_pickle_filename, "rb"))
a_timer.start_function_time()
signal_dict = {}
for k, arb_id in arb_id_dict.items():
if not arb_id.static:
for token in arb_id.tokenization:
a_timer.start_iteration_time()
signal = Signal(k, token[0], token[1])
# Convert the binary ndarray to a list of string representations of each row
temp1 = [''.join(str(x) for x in row) for row in arb_id.boolean_matrix[:, token[0]:token[1] + 1]]
temp2 = zeros((temp1.__len__(), 1), dtype=uint64)
# convert each string representation to int
for i, row in enumerate(temp1):
temp2[i] = int(row, 2)
# create an unsigned integer pandas.Series using the time index from this Arb ID's original data.
signal.time_series = Series(temp2[:, 0], index=arb_id.original_data.index, dtype=float64)
# Normalize the signal and update its meta-data
signal.normalize_and_set_metadata(normalize_strategy)
# add this signal to the signal dictionary which is keyed by Arbitration ID
if k in signal_dict:
signal_dict[k][(arb_id.id, signal.start_index, signal.stop_index)] = signal
else:
signal_dict[k] = {(arb_id.id, signal.start_index, signal.stop_index): signal}
a_timer.set_token_to_signal()
a_timer.set_signal_generation()
return signal_dict
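
# Illustrative sketch of the greedy tokenizer on a hypothetical 16-bit TANG: the first byte
# never toggles (padding) while the second byte has monotonically rising transition
# frequencies, so it should cluster into a single token.
if __name__ == '__main__':
    from numpy import array
    toy_tang = array([0, 0, 0, 0, 0, 0, 0, 0, .1, .2, .3, .4, .5, .6, .7, .8])
    toy_tokens, toy_padding = get_composition_just_tang(toy_tang, include_padding=False)
    print(toy_tokens)      # expected: [(8, 15)]
    print(toy_padding)     # expected: [0, 1, 2, 3, 4, 5, 6, 7]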

View File

@ -0,0 +1,38 @@
from FileBoi import FileBoi
from Sample import Sample
# from Plotter import plot_sample_threshold_heatmap
# Cross validation parameters for finding an optimal tokenization inversion distance threshold -- NOT WORKING?
kfold_n: int = 5
current_vehicle_number = 0
good_boi = FileBoi()
samples = good_boi.go_fetch(kfold_n)
for key, sample_list in samples.items(): # type: tuple, list
for sample in sample_list: # type: Sample
print(current_vehicle_number)
print("\nData import and Pre-Processing for " + sample.output_vehicle_dir)
id_dict, j1979_dict = sample.pre_process()
if j1979_dict:
sample.plot_j1979(j1979_dict, vehicle_number=str(current_vehicle_number))
        # The following 3 lines of code were intended to find good settings for TANG inversions... it didn't work?
# print("\nFinding optimal lexical analysis threshold parameters for " + sample.output_vehicle_dir)
# sample.find_lex_thresholds(id_dict)
# plot_sample_threshold_heatmap(sample)
# LEXICAL ANALYSIS #
print("\n\t##### BEGINNING LEXICAL ANALYSIS OF " + sample.output_vehicle_dir + " #####")
sample.tokenize_dictionary(id_dict)
signal_dict = sample.generate_signals(id_dict, bool(j1979_dict))
sample.plot_arb_ids(id_dict, signal_dict, vehicle_number=str(current_vehicle_number))
        # SEMANTIC ANALYSIS #
print("\n\t##### BEGINNING SEMANTIC ANALYSIS OF " + sample.output_vehicle_dir + " #####")
corr_matrix, combined_df = sample.generate_correlation_matrix(signal_dict)
if j1979_dict:
signal_dict, j1979_correlation = sample.j1979_labeling(j1979_dict, signal_dict, combined_df)
cluster_dict, linkage_matrix = sample.cluster_signals(corr_matrix)
sample.plot_clusters(cluster_dict, signal_dict, bool(j1979_dict), vehicle_number=str(current_vehicle_number))
sample.plot_dendrogram(linkage_matrix, vehicle_number=str(current_vehicle_number))
current_vehicle_number += 1

View File

@ -0,0 +1,149 @@
from time import time
from typing import List
class PipelineTimer:
def __init__(self, verbose: bool = True):
self.verbose: bool = verbose
self.function_time: float = 0.0
self.nested_function_time: float = 0.0
self.iteration_time: float = 0.0
self.nested_iteration_time: float = 0.0
# Pre-Processing Timings
self.can_csv_to_df: float = 0.0
self.raw_df_to_arb_id_dict: float = 0.0
self.arb_id_creation: List[float] = []
self.j1979_creation: float = 0.0
self.hex_to_bool_matrix: List[float] = []
self.bool_matrix_to_tang: List[float] = []
self.plot_save_j1979_dict: float = 0.0
self.plot_save_j1979_pid: List[float] = []
# Lexical Analysis Timings
self.tokenization: float = 0.0
self.tang_to_composition: List[float] = []
self.composition_merge: List[float] = []
self.signal_generation: float = 0.0
self.token_to_signal: List[float] = []
self.plot_save_arb_id_dict: float = 0.0
self.plot_save_arb_id: List[float] = []
# Semantic Analysis Timings
self.subset_selection: float = 0.0
self.plot_save_cluster_dict: float = 0.0
self.label_propagation: float = 0.0
self.plot_save_cluster: List[float] = []
def start_function_time(self):
self.function_time = time()
def start_nested_function_time(self):
self.nested_function_time = time()
def start_iteration_time(self):
self.iteration_time = time()
def start_nested_iteration_time(self):
self.nested_iteration_time = time()
# Pre-Processing Timings #
# Called in PreProcessor.import_csv
def set_can_csv_to_df(self):
self.can_csv_to_df = time() - self.function_time
if self.verbose:
print("\n" + str(self.can_csv_to_df) + " seconds to import and format raw data into a DataFrame")
# Called in PreProcessor.generate_arb_id_dictionary
def set_raw_df_to_arb_id_dict(self):
self.raw_df_to_arb_id_dict = time() - self.function_time
if self.verbose:
print("\n" + str(self.raw_df_to_arb_id_dict) +
" seconds to produce arbitration ID dictionary, boolean matrices, and TANGs")
# Called in the loop within PreProcessor.generate_arb_id_dictionary
def set_arb_id_creation(self):
self.arb_id_creation.append(time() - self.iteration_time)
def set_j1979_creation(self):
self.j1979_creation = time() - self.nested_function_time
if self.verbose:
print("\n" + str(self.j1979_creation) + " seconds to process J1979 response data into a dictionary")
# Called in ArbID.generate_binary_matrix_and_tang
def set_hex_to_bool_matrix(self):
        self.hex_to_bool_matrix.append(time() - self.nested_function_time)
# Called in ArbID.generate_binary_matrix_and_tang
def set_bool_matrix_to_tang(self):
        self.bool_matrix_to_tang.append(time() - self.nested_function_time)
# Called in the Plotter.py plot_j1979 function.
def set_plot_save_j1979_dict(self):
self.plot_save_j1979_dict = time() - self.function_time
if self.verbose:
print("\n" + str(self.plot_save_j1979_dict) + " seconds to plot and save the J1979 response data")
# Called in the loop within the Plotter.py plot_j1979 function.
def set_plot_save_j1979_pid(self):
self.plot_save_j1979_pid.append(time() - self.iteration_time)
# Lexical Analysis Timings #
# Called in the LexicalAnalysis.py tokenize_dictionary function.
def set_tokenization(self):
self.tokenization = time() - self.function_time
if self.verbose:
print("\n" + str(self.tokenization) + " seconds to tokenize the arbitration ID dictionary using TANGs")
# Called in the loop within the LexicalAnalysis.py tokenize_dictionary function.
def set_tang_to_composition(self):
self.tang_to_composition.append(time() - self.iteration_time)
# Called in the loop within the LexicalAnalysis.py tokenize_dictionary function.
def set_composition_merge(self):
self.composition_merge.append(time() - self.iteration_time)
# Called in the LexicalAnalysis.py generate_signals function.
def set_signal_generation(self):
self.signal_generation = time() - self.function_time
if self.verbose:
print("\n" + str(self.signal_generation) +
" seconds to generate signals and their statistics using token compositions.")
# Called in the loop within the LexicalAnalysis.py generate_signals function.
def set_token_to_signal(self):
self.token_to_signal.append(time() - self.iteration_time)
# Called in the Plotter.py plot_signals_by_arb_id function.
def set_plot_save_arb_id_dict(self):
self.plot_save_arb_id_dict = time() - self.function_time
if self.verbose:
print("\n" + str(self.plot_save_arb_id_dict) + " seconds to plot and save the Signals and TANGs by Arb ID")
# Called in the loop within the Plotter.py plot_signals_by_arb_id function.
def set_plot_save_arb_id(self):
self.plot_save_arb_id.append(time() - self.iteration_time)
# Semantic Analysis Timings #
# Called in the SemanticAnalysis.py subset_correlation function.
def set_subset_selection(self):
self.subset_selection = time() - self.function_time
# Called in the SemanticAnalysis.py label_propagation function.
def set_label_propagation(self):
self.label_propagation = time() - self.function_time
if self.verbose:
print("\n" + str(self.label_propagation) + " seconds to perform label propagation.")
# Called in the Plotter.py plot_signals_by_cluster function.
def set_plot_save_cluster_dict(self):
self.plot_save_cluster_dict = time() - self.function_time
if self.verbose:
print("\n" + str(self.plot_save_cluster_dict) + " seconds to plot and save the clusters.")
# Called in the loop within the Plotter.py plot_signals_by_cluster function.
def set_plot_save_cluster(self):
self.plot_save_cluster.append(time() - self.iteration_time)

View File

@ -0,0 +1,313 @@
import matplotlib.pyplot as plt
from matplotlib.pyplot import savefig
from numpy import where, isin
from os import chdir, mkdir, path, remove
from shutil import rmtree
from PipelineTimer import PipelineTimer
from scipy.cluster.hierarchy import dendrogram
figure_format: str = "png" # png works well for quickly previewing results and draft results write ups.
# figure_format: str = "eps" # Journals/Conferences generally prefer EPS file format for camera-ready copies.
figure_dpi: int = 300
figure_transp: bool = False
arb_id_folder: str = 'figures'
cluster_folder: str = 'clusters'
j1979_folder: str = 'j1979'
threshold_folder: str = 'threshold_heatmaps'
def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, vehicle_number: str,
force: bool=False):
if path.exists(arb_id_folder):
if force:
rmtree(arb_id_folder)
else:
print("\nArbID plotting appears to have already been done and forcing is turned off. Skipping...")
return
a_timer.start_function_time()
for k_id, signals in signal_dict.items():
arb_id = arb_id_dict[k_id]
if not arb_id.static and not arb_id.short:
print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ") for Vehicle " + vehicle_number)
a_timer.start_iteration_time()
signals_to_plot = []
# Don't plot the static signals
for k_signal, signal in signals.items():
if not signal.static:
signals_to_plot.append(signal)
# There's a corner case where the Arb ID only has static signals. This conditional accounts for this.
# TODO: This corner case should probably be reflected by arb_id.static.
if len(signals_to_plot) < 1:
continue
            # One row per signal plus one for the TANG. nrows is always at least 2 here, so axes is returned as an array.
fig, axes = plt.subplots(nrows=1 + len(signals_to_plot), ncols=1)
plt.suptitle("Time Series and TANG for Arbitration ID " + hex(k_id) + " from Vehicle " + vehicle_number,
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + len(signals_to_plot) + 1) * 1.3)
# The min() statement provides whitespace for the title depending on the number of subplots.
size_adjust = len(signals_to_plot) / 100
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
for i, signal in enumerate(signals_to_plot):
ax = axes[i]
ax.set_title(signal.plot_title,
style='italic',
size='medium')
ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
ax.plot(signal.time_series, color='black')
# Add a 25% opacity dashed black line to the entropy gradient plot at one boundary of each sub-flow
axes[-1].axvline(x=signal.start_index, alpha=0.25, c='black', linestyle='dashed')
# Plot the entropy gradient at the bottom of the overall output
ax = axes[-1]
ax.set_title("Min-Max Normalized Transition Aggregation N-Gram (TANG)",
style='italic',
size='medium')
tang_bit_width = arb_id.tang.shape[0]
ax.set_xlim([-0.01 * tang_bit_width, 1.005 * tang_bit_width])
y = arb_id.tang[:]
# Differentiate bit positions with non-zero and zero entropy using black points and grey x respectively.
ix = isin(y, 0)
pad_bit = where(ix)
non_pad_bit = where(~ix)
ax.scatter(non_pad_bit, y[non_pad_bit], color='black', marker='o', s=10)
ax.scatter(pad_bit, y[pad_bit], color='grey', marker='^', s=10)
if not path.exists(arb_id_folder):
mkdir(arb_id_folder)
chdir(arb_id_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig(hex(arb_id.id) + "." + figure_format,
                    bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
chdir("..")
plt.close(fig)
a_timer.set_plot_save_arb_id()
print("\tComplete...")
a_timer.set_plot_save_arb_id_dict()
def plot_signals_by_cluster(a_timer: PipelineTimer,
cluster_dict: dict,
signal_dict: dict,
use_j1979_tags: bool,
vehicle_number: str,
force: bool=False):
if path.exists(cluster_folder):
if force:
rmtree(cluster_folder)
else:
print("\nCluster plotting appears to have already been done and forcing is turned off. Skipping...")
return
a_timer.start_function_time()
print("\n")
for cluster_number, list_of_signals in cluster_dict.items():
print("Plotting cluster", cluster_number, "with " + str(len(list_of_signals)) + " signals.")
a_timer.start_iteration_time()
# Setup the plot
fig, axes = plt.subplots(nrows=len(list_of_signals), ncols=1, squeeze=False)
plt.suptitle("Signal Cluster " + str(cluster_number) + " from Vehicle " + vehicle_number,
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + len(list_of_signals)+1) * 1.3)
size_adjust = len(list_of_signals) / 100
# The min() statement provides whitespace for the suptitle depending on the number of subplots.
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
# Plot the time series of each signal in the cluster
for i, signal_key in enumerate(list_of_signals):
signal = signal_dict[signal_key[0]][signal_key]
ax = axes[i, 0]
if signal.j1979_title and use_j1979_tags:
this_title = signal.plot_title + " [" + signal.j1979_title + \
" (PCC:" + str(round(signal.j1979_pcc, 2)) + ")]"
else:
this_title = signal.plot_title
ax.set_title(this_title,
style='italic',
size='medium')
ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
ax.plot(signal.time_series, color='black')
if not path.exists(cluster_folder):
mkdir(cluster_folder)
chdir(cluster_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig("cluster_" + str(cluster_number) + "." + figure_format,
                bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
chdir("..")
plt.close(fig)
a_timer.set_plot_save_cluster()
print("\tComplete...")
a_timer.set_plot_save_cluster_dict()
def plot_j1979(a_timer: PipelineTimer, j1979_dict: dict, vehicle_number: str, force: bool=False):
if path.exists(j1979_folder):
if force:
rmtree(j1979_folder)
else:
print("\nJ1979 plotting appears to have already been done and forcing is turned off. Skipping...")
return
a_timer.start_function_time()
print("Plotting J1979 response data")
plot_length = len(j1979_dict.keys())
# Setup the plot
fig, axes = plt.subplots(nrows=plot_length, ncols=1, squeeze=False)
plt.suptitle("J1979 Data Collected from Vehicle " + vehicle_number,
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + plot_length) * 1.3)
size_adjust = plot_length / 100
# The min() statement provides whitespace for the suptitle depending on the number of subplots.
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
# Plot the time series of each signal in the cluster
for i, (pid, data) in enumerate(j1979_dict.items()):
a_timer.start_iteration_time()
ax = axes[i, 0]
ax.set_title("PID " + str(hex(pid)) + ": " + data.title,
style='italic',
size='medium')
ax.set_xlim([data.data.first_valid_index(), data.data.last_valid_index()])
ax.plot(data.data, color='black')
a_timer.set_plot_save_j1979_pid()
if not path.exists(j1979_folder):
mkdir(j1979_folder)
chdir(j1979_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig("j1979." + figure_format,
            bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
chdir("..")
plt.close(fig)
a_timer.set_plot_save_j1979_dict()
print("\tComplete...")
def plot_sample_threshold_heatmap(sample):
this_figure_name = "alignment_scores_" + sample.output_vehicle_dir + "." + figure_format
if path.exists(threshold_folder):
chdir(threshold_folder)
if path.isfile(this_figure_name):
if sample.force_threshold_plot:
remove(this_figure_name)
else:
print("\nThreshold heatmap plotting for " + sample.output_vehicle_dir + " already complete.")
print("\tHeatmap plot forcing is turned off. Skipping...")
chdir("..")
return
chdir("..")
print("\tPlotting threshold parameter-Alignment Score heatmap for " + sample.output_vehicle_dir)
if not path.exists(threshold_folder):
mkdir(threshold_folder)
chdir(threshold_folder)
fig, ax = plt.subplots()
halfway_mark = int(round(sample.avg_score_matrix.shape[0]/2, 0))
sample.avg_score_matrix = sample.avg_score_matrix[0:halfway_mark, 0:halfway_mark].astype(float)
sample.validator.set_lex_threshold_parameters(sample)
im = ax.imshow(sample.avg_score_matrix, cmap='inferno', interpolation='none', vmin=0.0, vmax=1.0)
# im = ax.imshow(sample.avg_score_matrix, cmap='Greys', interpolation='nearest')
# ax.set_xticks(arange(sample.avg_score_matrix.shape[1]))
# ax.set_yticks(arange(sample.avg_score_matrix.shape[0]))
cbar = ax.figure.colorbar(im, ax=ax)
cbar.ax.set_ylabel("Alignment Score", rotation=-90, va="bottom")
ax.set_title("Average Alignment Score as a Function of \n Lexical Analysis Threshold Parameters for " +
sample.year + " " + sample.make + " " + sample.model + "\nInversion: " +
str(round(sample.optimal_bit_dist, 2)) + " Merge: " + str(round(sample.optimal_merge_dist, 2)) +
" Score: " +
str(round(sample.avg_score_matrix[sample.optimal_bit_dist, sample.optimal_merge_dist], 2)))
fig.tight_layout()
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig(this_figure_name,
            bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
plt.close()
chdir("..")
print("\t\tComplete...")
def plot_dendrogram(a_timer: PipelineTimer,
linkage_matrix,
threshold: float,
vehicle_number: str,
force: bool = False):
dendrogram_filename = "dendrogram_" + vehicle_number + "." + figure_format
if path.isfile(dendrogram_filename):
if force:
remove(dendrogram_filename)
else:
print("Dendrogram already plotted. Skipping...")
return
plt.figure(figsize=(7, 7), dpi=600)
R: dict = dendrogram(Z=linkage_matrix, orientation='top', distance_sort='ascending', no_labels=True)
plt.title("Dendrogram of Agglomerative Clustering for Vehicle " + vehicle_number)
plt.xlabel("Signals Observed")
plt.ylabel("Single Linkage Cluster Merge Distance")
xmin, xmax = plt.xlim()
    # Add a 25% opacity dashed black line at the cluster merge distance threshold
plt.hlines(y=threshold, xmin=xmin, xmax=xmax, alpha=0.25, colors='black', linestyle='dashed',
label='cluster threshold')
plt.legend(loc='upper right')
print("\tPlotting dendrogram and saving to " + dendrogram_filename)
savefig(dendrogram_filename,
            bbox_inches='tight',
pad_inches=0.0,
dpi=600,
format=figure_format,
transparent=figure_transp)
plt.close()
print("\t\tComplete...")

View File

@ -0,0 +1,148 @@
from pandas import DataFrame, read_csv, Series
from numpy import int64
from os import path, remove, getcwd
from pickle import load
from typing import Callable
from ArbID import ArbID
from J1979 import J1979
from PipelineTimer import PipelineTimer
class PreProcessor:
def __init__(self, data_filename: str, id_output_filename: str, j1979_output_filename: str, use_j1979: bool):
self.data_filename: str = data_filename
self.id_output_filename: str = id_output_filename
self.j1979_output_filename: str = j1979_output_filename
self.data: DataFrame = None
self.import_time: float = 0.0
self.dictionary_time: float = 0.0
self.total_time: float = 0.0
self.use_j1979: bool = use_j1979
def import_csv(self, a_timer: PipelineTimer, filename):
def hex2int(x):
if x == '':
return 0
return int(x, 16)
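            # e.g. hex2int('7df') -> 2015; blank byte columns arrive as '' and map to 0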
def fix_time(x):
try:
return float(str(x)[:-1])
except ValueError:
# There may have been a newline the capture device was trying to write when turned off.
return None
# Used by pd.read_csv to apply the functions to the respective column vectors in the .csv file
convert_dict = {'time': fix_time, 'id': hex2int, 'dlc': hex2int, 'b0': hex2int, 'b1': hex2int, 'b2': hex2int,
'b3': hex2int, 'b4': hex2int, 'b5': hex2int, 'b6': hex2int, 'b7': hex2int}
print("\nReading in " + self.data_filename + "...")
a_timer.start_function_time()
self.data = read_csv(filename,
header=None,
names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
skiprows=7,
delimiter='\t',
converters=convert_dict,
index_col=0)
a_timer.set_can_csv_to_df()
# sanity check output of the original data
# print("\nSample of the original data:")
# print(self.data.head(5), "\n")
@staticmethod
def generate_j1979_dictionary(j1979_data: DataFrame) -> dict:
d = {}
services = j1979_data.groupby('b2')
for uds_pid, data in services:
d[uds_pid] = J1979(uds_pid, data)
return d
def generate_arb_id_dictionary(self,
a_timer: PipelineTimer,
normalize_strategy: Callable,
time_conversion: int = 1000,
freq_analysis_accuracy: float = 0.0,
freq_synchronous_threshold: float = 0.0,
force: bool = False) -> (dict, dict):
id_dictionary = {}
j1979_dictionary = {}
if force:
# Remove any existing pickled Arb ID and J1979 dictionaries and create new ones based on data_filename.
if path.isfile(self.id_output_filename):
remove(self.id_output_filename)
if path.isfile(self.j1979_output_filename):
remove(self.j1979_output_filename)
self.import_csv(a_timer, self.data_filename)
elif path.isfile(self.id_output_filename):
# This logic assumes that there will be a J1979 dict if and only if there is an Arb ID dict
print("\tLoading Arb ID dictionary from pickled data: " + getcwd() + "\\" + self.id_output_filename)
id_dictionary = load(open(self.id_output_filename, "rb"))
if path.isfile(self.j1979_output_filename):
print("\tLoading J1979 dictionary from pickled data: " + getcwd() + "\\" + self.j1979_output_filename)
j1979_dictionary = load(open(self.j1979_output_filename, "rb"))
print("\tSet 'force_pre_processing' in Sample.py to True to re-compute instead...")
return id_dictionary, j1979_dictionary
else:
self.import_csv(a_timer, self.data_filename)
a_timer.start_function_time()
for arb_id in Series.unique(self.data['id']):
if isinstance(arb_id, int64):
if arb_id == 2015:
                    # These are the J1979 requests (if any) (ID 0x7DF = 2015). Just ignore them.
continue
elif arb_id == 2024 and self.use_j1979:
                    # These are the J1979 responses (ID 0x7E8 = 2024, the standard response ID for requests sent on 0x7DF)
j1979_data = self.data.loc[self.data['id'] == arb_id].copy()
j1979_data.drop('dlc', axis=1, inplace=True)
j1979_data.drop('id', axis=1, inplace=True)
a_timer.start_nested_function_time()
j1979_dictionary = self.generate_j1979_dictionary(j1979_data)
a_timer.set_j1979_creation()
elif arb_id > 0:
a_timer.start_iteration_time()
this_id = ArbID(arb_id)
this_id.original_data = self.data.loc[self.data['id'] == arb_id]
this_id.original_data = this_id.original_data.copy() # type: DataFrame
# Check if the Arbitration ID always used the same DLC. If not, ignore it.
# We can effectively ignore this Arb ID by not adding it to the Arb ID dictionary.
                    if this_id.original_data['dlc'].nunique() != 1:
continue
this_id.dlc = this_id.original_data['dlc'].iloc[0]
this_id.original_data.drop('dlc', axis=1, inplace=True)
this_id.original_data.drop('id', axis=1, inplace=True)
# If DLC < 8, we can automatically drop data column vectors > DLC.
# E.G. drop bytes "B7" and "B6" if DLC is 6; those are padding data injected by can-dump and were
# not actually on the bus.
if this_id.dlc < 8:
for i in range(this_id.dlc, 8):
this_id.original_data.drop('b' + str(i), axis=1, inplace=True)
# Check if there are duplicate index values and correct them.
if not this_id.original_data.index.is_unique:
correction_mask = this_id.original_data.index.duplicated()
this_id.original_data = this_id.original_data[~correction_mask]
this_id.generate_binary_matrix_and_tang(a_timer, normalize_strategy)
this_id.analyze_transmission_frequency(time_convert=time_conversion,
ci_accuracy=freq_analysis_accuracy,
synchronous_threshold=freq_synchronous_threshold)
id_dictionary[arb_id] = this_id
a_timer.set_arb_id_creation()
a_timer.set_raw_df_to_arb_id_dict()
return id_dictionary, j1979_dictionary

View File

@ -0,0 +1,305 @@
from PreProcessor import PreProcessor
from Validator import Validator
from LexicalAnalysis import tokenize_dictionary, generate_signals
from SemanticAnalysis import generate_correlation_matrix, signal_clustering, j1979_signal_labeling
from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster, plot_dendrogram
from sklearn.preprocessing import minmax_scale
from typing import Callable
from PipelineTimer import PipelineTimer
from os import chdir, mkdir, path, remove
from pickle import dump, load
from numpy import ndarray, zeros, float16
from pandas import DataFrame
# File names for the on-disc data input and output.
output_folder: str = 'output'
pickle_arb_id_filename: str = 'pickleArbIDs.p'
pickle_threshold_filename: str = 'pickleLexThresholds.p'
pickle_j1979_filename: str = 'pickleJ1979.p'
pickle_signal_filename: str = 'pickleSignals.p'
pickle_subset_filename: str = 'pickleSubset.p'
csv_corr_matrix_filename: str = 'subset_correlation_matrix.csv'
pickle_j1979_corr_matrix: str = 'pickleJ1979_correlation.p'
pickle_clusters_filename: str = 'pickleClusters.p'
pickle_linkage_filename: str = 'pickleLinkage.p'
pickle_combined_df_filename: str = 'pickleCombinedDataFrame.p'
csv_all_signals_filename: str = 'complete_correlation_matrix.csv'
pickle_timer_filename: str = 'pickleTimer.p'
dump_to_pickle: bool = True
# Change out the normalization strategies as needed.
tang_normalize_strategy: Callable = minmax_scale
signal_normalize_strategy: Callable = minmax_scale
# Turn on or off portions of the pipeline and output methods using these flags.
force_pre_processing: bool = False
force_threshold_search: bool = False
force_threshold_plotting: bool = False
force_j1979_plotting: bool = True
use_j1979: bool = True
force_lexical_analysis: bool = False
force_signal_generation: bool = False
force_arb_id_plotting: bool = True
force_correlation_matrix: bool = False
force_clustering: bool = False
force_signal_labeling: bool = False
use_j1979_tags_in_plots: bool = True
force_cluster_plotting: bool = True
force_dendrogram_plotting: bool = True
# Parameters and threshold used for Arb ID transmission frequency analysis during Pre-processing.
time_conversion = 1000 # convert seconds to milliseconds
z_lookup = {.8: 1.28, .9: 1.645, .95: 1.96, .98: 2.33, .99: 2.58}
freq_analysis_accuracy = z_lookup[0.9]
freq_synchronous_threshold = 0.1
# Threshold parameters used during lexical analysis.
tokenization_bit_distance: float = 0.2
tokenize_padding: bool = True
merge_tokens: bool = True
# Threshold parameters used during semantic analysis
subset_selection_size: float = 0.25
max_intra_cluster_distance: float = 0.20
min_j1979_correlation: float = 0.85
# fuzzy_labeling: bool = True
# A timer class to record timings throughout the pipeline.
a_timer = PipelineTimer(verbose=True)
class Sample:
def __init__(self, make: str, model: str, year: str, sample_index: str, sample_path: str, kfold_n: int):
# Sample Specific Meta-Data
self.make: str = make
self.model: str = model
self.year: str = year
self.path: str = sample_path
self.output_vehicle_dir: str = make + "_" + model + "_" + year
self.output_sample_dir: str = sample_index
# Pre-Processing Settings
self.use_j1979: bool = use_j1979
self.force_threshold_plot: bool = force_threshold_plotting
self.avg_score_matrix: ndarray = zeros((1, 1), dtype=float16)
# Lexical analysis settings
self.tang_inversion_bit_dist: float = tokenization_bit_distance
self.use_padding: bool = tokenize_padding
self.merge_tokens: bool = merge_tokens
self.token_merge_dist: float = tokenization_bit_distance
# Semantic analysis settings
self.max_inter_cluster_dist: float = max_intra_cluster_distance
# Various comparison testing methods are implemented in the Validator class
self.validator: Validator = Validator(use_j1979, kfold_n)
def make_and_move_to_vehicle_directory(self):
# This drills down three directories to './output/make_model_year/sample_index/' Make directories as needed
if not path.exists(output_folder):
mkdir(output_folder)
chdir(output_folder)
if not path.exists(self.output_vehicle_dir):
mkdir(self.output_vehicle_dir)
chdir(self.output_vehicle_dir)
if not path.exists(self.output_sample_dir):
mkdir(self.output_sample_dir)
chdir(self.output_sample_dir)
@staticmethod
def move_back_to_parent_directory():
# Move back to root of './output/make_model_year/sample_index/"
chdir("../../../")
def pre_process(self):
self.make_and_move_to_vehicle_directory()
pre_processor = PreProcessor(self.path, pickle_arb_id_filename, pickle_j1979_filename, self.use_j1979)
id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
tang_normalize_strategy,
time_conversion,
freq_analysis_accuracy,
freq_synchronous_threshold,
force_pre_processing)
if dump_to_pickle:
if force_pre_processing:
if path.isfile(pickle_arb_id_filename):
remove(pickle_arb_id_filename)
if path.isfile(pickle_j1979_filename):
remove(pickle_j1979_filename)
# Lexical analysis will add additional information to the Arb ID dict. Don't dump if you're going to
# immediately delete and replace pickle_arb_id_filename during Lexical Analysis.
if not force_lexical_analysis:
if not path.isfile(pickle_arb_id_filename) and id_dictionary:
print("\nDumping arb ID dictionary for " + self.output_vehicle_dir + " to " +
pickle_arb_id_filename)
dump(id_dictionary, open(pickle_arb_id_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_j1979_filename) and j1979_dictionary:
print("\nDumping J1979 dictionary for " + self.output_vehicle_dir + " to " + pickle_j1979_filename)
dump(j1979_dictionary, open(pickle_j1979_filename, "wb"))
print("\tComplete...")
self.move_back_to_parent_directory()
return id_dictionary, j1979_dictionary
def plot_j1979(self, j1979_dictionary: dict, vehicle_number: str):
self.make_and_move_to_vehicle_directory()
plot_j1979(a_timer, j1979_dictionary, vehicle_number, force_j1979_plotting)
self.move_back_to_parent_directory()
def find_lex_thresholds(self, id_dict: dict):
self.make_and_move_to_vehicle_directory()
if path.isfile(pickle_threshold_filename):
if force_threshold_search:
# Remove any existing pickled threshold parameter search result matrix and create one.
remove(pickle_threshold_filename)
else:
print("\n\tLex threshold search already completed and forcing is turned off. Using pickled data...")
self.avg_score_matrix = load(open(pickle_threshold_filename, "rb"))
self.validator.set_lex_threshold_parameters(self)
# Move back to root of './output/make_model_year/sample_index/"
self.move_back_to_parent_directory()
return
self.validator.k_fold_lex_threshold_selection(id_dict=id_dict, sample=self)
if not path.isfile(pickle_threshold_filename):
print("\nDumping lexical analysis threshold search matrix for " + self.output_vehicle_dir +
" to " + pickle_threshold_filename)
dump(self.avg_score_matrix, open(pickle_threshold_filename, "wb"))
print("\tComplete...")
self.move_back_to_parent_directory()
def tokenize_dictionary(self, id_dictionary: dict):
# Using force_pre_processing = True and force_lexical_analysis = False will cause the 2nd condition to trigger.
if force_lexical_analysis or not path.isfile(pickle_arb_id_filename):
tokenize_dictionary(a_timer=a_timer, d=id_dictionary, force=force_lexical_analysis,
include_padding=self.use_padding, merge=self.merge_tokens,
max_distance=self.tang_inversion_bit_dist)
if dump_to_pickle:
self.make_and_move_to_vehicle_directory()
if force_lexical_analysis:
if path.isfile(pickle_arb_id_filename):
remove(pickle_arb_id_filename)
if not path.isfile(pickle_arb_id_filename) and id_dictionary:
print("\nDumping arb ID dictionary for " + self.output_vehicle_dir + " to " + pickle_arb_id_filename)
dump(id_dictionary, open(pickle_arb_id_filename, "wb"))
print("\tComplete...")
self.move_back_to_parent_directory()
def generate_signals(self, id_dictionary: dict, postpone_pickle: bool = False):
self.make_and_move_to_vehicle_directory()
signal_dict = generate_signals(a_timer=a_timer,
arb_id_dict=id_dictionary,
signal_pickle_filename=pickle_signal_filename,
normalize_strategy=signal_normalize_strategy,
force=force_signal_generation)
# postpone_pickle is simply a check whether J1979 data was present in the sample. If it was present, then wait
# to save out the signal_dictionary until correlated Signals are labeled by sample.j1979_labeling().
if dump_to_pickle and not postpone_pickle and not path.isfile(pickle_signal_filename):
print("\nDumping signal dictionary for " + self.output_vehicle_dir + " to " + pickle_signal_filename)
dump(signal_dict, open(pickle_signal_filename, "wb"))
print("\tComplete...")
self.move_back_to_parent_directory()
return signal_dict
def plot_arb_ids(self, id_dictionary: dict, signal_dictionary: dict, vehicle_number: str):
self.make_and_move_to_vehicle_directory()
plot_signals_by_arb_id(a_timer=a_timer,
arb_id_dict=id_dictionary,
signal_dict=signal_dictionary,
vehicle_number=vehicle_number,
force=force_arb_id_plotting)
self.move_back_to_parent_directory()
def generate_correlation_matrix(self, signal_dictionary: dict):
self.make_and_move_to_vehicle_directory()
if dump_to_pickle and force_correlation_matrix:
if path.isfile(csv_corr_matrix_filename):
remove(csv_corr_matrix_filename)
corr_matrix, combined_df = generate_correlation_matrix(a_timer=a_timer,
csv_signals_correlation_filename=csv_corr_matrix_filename,
combined_df_filename=pickle_combined_df_filename,
signal_dict=signal_dictionary,
force=force_correlation_matrix)
if not path.isfile(csv_corr_matrix_filename) and not corr_matrix.empty:
print("\nDumping subset correlation matrix for " + self.output_vehicle_dir + " to " +
csv_corr_matrix_filename)
corr_matrix.to_csv(csv_corr_matrix_filename)
print("\tComplete...")
if not path.isfile(pickle_combined_df_filename) and not combined_df.empty:
print("\nDumping combined signal DataFrame matrix for " + self.output_vehicle_dir + " to " +
pickle_combined_df_filename)
dump(combined_df, open(pickle_combined_df_filename, "wb"))
print("\tComplete...")
self.move_back_to_parent_directory()
return corr_matrix, combined_df
def cluster_signals(self, corr_matrix: DataFrame):
self.make_and_move_to_vehicle_directory()
cluster_dict, linkage_matrix = signal_clustering(corr_matrix,
self.max_inter_cluster_dist,
pickle_clusters_filename,
pickle_linkage_filename,
force_clustering) # type: dict, ndarray
# Before we return or save the clusters, lets remove all singleton clusters. This serves as an implicit
# filtering technique for incorrectly tokenized signals.
list_to_remove = []
for k, cluster in cluster_dict.items():
if len(cluster) < 2:
list_to_remove.append(k)
for k in list_to_remove:
cluster_dict.pop(k, None)
if not path.isfile(pickle_clusters_filename) and cluster_dict:
print("\nDumping cluster dictionary to " + pickle_clusters_filename)
dump(cluster_dict, open(pickle_clusters_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_linkage_filename):
print("\nDumping agglomerative clustering linkage matrix to " + pickle_linkage_filename)
dump(linkage_matrix, open(pickle_linkage_filename, "wb"))
print("\tComplete...")
self.move_back_to_parent_directory()
return cluster_dict, linkage_matrix
def j1979_labeling(self, j1979_dictionary: dict, signal_dictionary: dict, combined_df: DataFrame):
self.make_and_move_to_vehicle_directory()
signal_dictionary, j1979_correlation_matrix = j1979_signal_labeling(a_timer=a_timer,
j1979_corr_filename=pickle_j1979_corr_matrix,
signal_filename=pickle_signal_filename,
df_signals=combined_df,
j1979_dict=j1979_dictionary,
signal_dict=signal_dictionary,
correlation_threshold=min_j1979_correlation,
force=force_signal_generation)
# If the signal dictionary pickled data was deleted because j1979 tagging needed to happen, then save it with
# the tags added by the call to j1979_signal_labeling().
if not path.isfile(pickle_signal_filename) and signal_dictionary and j1979_dictionary:
print("\nDumping J1979 tagged signal dictionary to " + pickle_signal_filename)
dump(signal_dictionary, open(pickle_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_j1979_corr_matrix):
print("\nDumping j1979 signal correlation matrix to " + pickle_j1979_corr_matrix)
dump(j1979_correlation_matrix, open(pickle_j1979_corr_matrix, "wb"))
print("\tComplete...")
self.move_back_to_parent_directory()
return signal_dictionary, j1979_correlation_matrix
def plot_clusters(self, cluster_dictionary: dict, signal_dictionary: dict, use_j1979_tags: bool,
vehicle_number: str):
self.make_and_move_to_vehicle_directory()
plot_signals_by_cluster(a_timer=a_timer,
cluster_dict=cluster_dictionary,
signal_dict=signal_dictionary,
use_j1979_tags=use_j1979_tags,
vehicle_number=vehicle_number,
force=force_cluster_plotting)
self.move_back_to_parent_directory()
def plot_dendrogram(self, linkage_matrix: ndarray, vehicle_number: str):
self.make_and_move_to_vehicle_directory()
plot_dendrogram(a_timer=a_timer, linkage_matrix=linkage_matrix, threshold=self.max_inter_cluster_dist,
vehicle_number=vehicle_number, force=force_dendrogram_plotting)
self.move_back_to_parent_directory()

View File

@ -0,0 +1,427 @@
from pandas import concat, DataFrame, read_csv
from numpy import ndarray, zeros
from os import path, remove
from pickle import load, dump
from ast import literal_eval
from J1979 import J1979
from Signal import Signal
from PipelineTimer import PipelineTimer
import scipy.spatial.distance as ssd
from scipy.cluster.hierarchy import linkage, fcluster
def generate_correlation_matrix(a_timer: PipelineTimer,
csv_signals_correlation_filename: str = '',
combined_df_filename: str = '',
signal_dict: dict = None,
force: bool = False):
if force:
if path.isfile(csv_signals_correlation_filename):
remove(csv_signals_correlation_filename)
if path.isfile(combined_df_filename):
remove(combined_df_filename)
if path.isfile(csv_signals_correlation_filename) and path.isfile(combined_df_filename) and not force:
print("\nA signal correlation matrix and combined matrix appears to exist and forcing is turned off. Using " +
csv_signals_correlation_filename + " and " + combined_df_filename)
# literal_eval converts the textual row/col tuple representation back to actual tuple data structures
return [read_csv(csv_signals_correlation_filename, index_col=0).rename(index=literal_eval, columns=literal_eval),
load(open(combined_df_filename, "rb"))]
non_static_signals_dict = {}
largest_index = []
df_columns = []
# Put all non-static signals into one DataFrame. Re-index all of them to share the same index.
for k_arb_id, arb_id_signals in signal_dict.items():
for k_signal_id, signal in arb_id_signals.items():
if not signal.static:
non_static_signals_dict[k_signal_id] = signal
df_columns.append(k_signal_id)
if signal.time_series.__len__() > largest_index.__len__():
largest_index = signal.time_series.index
df: DataFrame = DataFrame(zeros((largest_index.__len__(), df_columns.__len__())),
columns=df_columns,
index=largest_index)
for k_signal_id, signal in non_static_signals_dict.items():
df[k_signal_id] = signal.time_series.reindex(index=largest_index, method='nearest')
# Calculate the correlation matrix for this DataFrame of all non-static signals.
corr_matrix = df.corr()
# For some reason, despite no NaN and correct data types, the corr() method will return empty row/col for some
# signals. We're going to have to clean this up before clustering.
corr_matrix.dropna(axis=0, how='all', inplace=True)
corr_matrix.dropna(axis=1, how='all', inplace=True)
# Be sure to reflect any signal IDs dropped from the correlation matrix (due to empty row/col) in the combined DF
# as well.
return corr_matrix, df.loc[:, corr_matrix.columns.values]
def signal_clustering(corr_matrix: DataFrame,
threshold: float,
cluster_pickle: str = "",
linkage_pickle: str = "",
force: bool = False):
if force:
if path.isfile(cluster_pickle):
remove(cluster_pickle)
if path.isfile(linkage_pickle):
remove(linkage_pickle)
if path.isfile(cluster_pickle) and path.isfile(linkage_pickle):
print("\nSignal clustering already completed and forcing is turned off. Using pickled data...")
return [load(open(cluster_pickle, "rb")), load(open(linkage_pickle, "rb"))]
    # Clamp negative correlations to zero and convert correlations to distances (1 - correlation)
corr_matrix.where(corr_matrix > 0, 0, inplace=True)
corr_matrix = 1 - corr_matrix
X = corr_matrix.values # type: ndarray
Y = ssd.squareform(X)
# Z is the linkage matrix. This can serve as input to the scipy.cluster.hierarchy.dendrogram method
Z = linkage(Y, method='single', optimal_ordering=True)
fclus = fcluster(Z, t=threshold, criterion='distance')
cluster_dict = {}
for i, cluster_label in enumerate(fclus):
if cluster_label in cluster_dict:
cluster_dict[cluster_label].append(corr_matrix.index[i])
else:
cluster_dict[cluster_label] = [corr_matrix.index[i]]
return cluster_dict, Z
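# Worked toy example for signal_clustering above (hypothetical numbers): a 0.95 correlation
# between two signals becomes a distance of 1 - 0.95 = 0.05, so with the 0.20 threshold
# configured in Sample.py they merge into one cluster, while a third signal correlating with
# them at only 0.2 (distance 0.8) is cut away by fcluster into a singleton cluster that
# Sample.cluster_signals later drops.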
def subset_selection(a_timer: PipelineTimer,
signal_dict: dict = None,
subset_pickle: str = "",
force: bool = False,
subset_size: float = 0.25) -> DataFrame:
if path.isfile(subset_pickle):
if force:
# Remove any existing pickled Signal dictionary and create one.
remove(subset_pickle)
else:
print("\nSubset selection already completed and forcing is turned off. Using pickled data...")
return load(open(subset_pickle, "rb"))
a_timer.start_function_time()
signal_index = 0
for k_arb_id, arb_id_signals in signal_dict.items():
for k_signal_id, signal in arb_id_signals.items():
if not signal.static:
signal_index += 1
# setup subset selection data structure
df: DataFrame = DataFrame(zeros((signal_index, 4)),
columns=["arb_id", "start_index", "stop_index", "Shannon_Index"])
for i, (k_arb_id, arb_id_signals) in enumerate(signal_dict.items()):
for j, (k_signal_id, signal) in enumerate(arb_id_signals.items()):
if not signal.static:
df.iloc[signal_index-1] = [k_arb_id, signal.start_index, signal.stop_index, signal.shannon_index]
signal_index -= 1
# sort by Shannon Index
df.sort_values(by="Shannon_Index", inplace=True, ascending=False)
# Select subset with largest Shannon Index Values
df = df.head(int(round(df.__len__() * subset_size, 0)))
# In order to make an arb ID sorted output, sort this subset by arb_id
df.sort_values(by="arb_id", inplace=True)
# Re-index each Signal in the subset using the Signal with the most observed samples. Prepare to create a DataFrame
# that can be used for generating a correlation matrix.
subset = []
subset_cols = []
largest_index = []
for index, row in df.iterrows():
signal_id = (int(row[0]), int(row[1]), int(row[2]))
signal = signal_dict[row[0]][signal_id]
subset.append(signal)
subset_cols.append(signal_id)
if signal.time_series.__len__() > largest_index.__len__():
largest_index = signal.time_series.index
subset_df: DataFrame = DataFrame(zeros((largest_index.__len__(), subset.__len__())),
columns=subset_cols,
index=largest_index)
for signal in subset:
signal_id = (signal.arb_id, signal.start_index, signal.stop_index)
subset_df[signal_id] = signal.time_series.reindex(index=largest_index, method='nearest')
a_timer.set_subset_selection()
return subset_df
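# Minimal usage sketch (illustrative names): subset_df = subset_selection(timer, signal_dict=signals,
# subset_pickle='subset.p', subset_size=0.25)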
def subset_correlation(subset: DataFrame,
csv_correlation_filename: str,
force: bool = False) -> DataFrame:
if not force and path.isfile(csv_correlation_filename):
print("\nA subset correlation appears to exist and forcing is turned off. Using " + csv_correlation_filename)
# Read the .csv into a DataFrame. Also, we need to convert the columns and index from strings back to tuples.
# Pandas.read_csv brings the data in as a DataFrame. Pandas.DataFrame.rename converts the columns and index with
# ast.literal_eval. Literal_eval will convert a string representation of a tuple back to an actual tuple.
return read_csv(csv_correlation_filename, index_col=0).rename(index=literal_eval, columns=literal_eval)
else:
return subset.corr()
def greedy_signal_clustering(correlation_matrix: DataFrame = None,
correlation_threshold: float = 0.8,
fuzzy_labeling: bool = True) -> dict:
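# Greedy clustering: walk every off-diagonal pair of the correlation matrix and, whenever the (rounded)
# correlation meets the threshold, give the two signals a shared cluster label, creating new labels as
# needed. With fuzzy_labeling a signal is allowed to carry more than one cluster label.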
correlation_keys = correlation_matrix.columns.values
previously_clustered_signals = {}
cluster_dict = {}
new_cluster_label = 0
for n, row in enumerate(correlation_keys):
for m, col in enumerate(correlation_keys):
if n == m:
# this is a diagonal on the correlation matrix. Skip it.
continue
# I chose to round here to allow relationships that are 'oh so close' to make it. No reason this HAS to be done.
result = round(correlation_matrix.iloc[n, m], 2)
# check if this is a significant correlation according to our heuristic threshold.
if result >= correlation_threshold:
# Check if the current row signal is currently unlabeled
if row not in previously_clustered_signals.keys():
# Check if the current col signal is currently unlabeled
if col not in previously_clustered_signals.keys():
# Both signals are unlabeled. Create a new one.
cluster_dict[new_cluster_label] = [row, col]
previously_clustered_signals[row] = {new_cluster_label}
previously_clustered_signals[col] = {new_cluster_label}
# print("created new cluster #", new_cluster_label, cluster_dict[new_cluster_label])
new_cluster_label += 1
else:
# Row isn't labeled but col is; add row to all of col's clusters.
# print("adding", row, "to clusters", previously_clustered_signals[col])
# row is not already in a cluster, add it to col's set of clusters
for label in previously_clustered_signals[col]:
cluster_dict[label].append(row)
previously_clustered_signals[row] = previously_clustered_signals[col]
else:
# Check if the current col signal is currently unlabeled
if col not in previously_clustered_signals.keys():
# Row is labeled but col is not; add col to row's set of clusters
# print("adding", col, "to clusters", previously_clustered_signals[row])
for label in previously_clustered_signals[row]:
cluster_dict[label].append(col)
previously_clustered_signals[col] = previously_clustered_signals[row]
# Both signals are already labeled
else:
# Check if we're using fuzzy labeling (a signal can belong to multiple clusters).
# If so, check whether the intersection of the two label sets is empty. If it is, this is a
# relationship that hasn't already been captured by an existing cluster. Make a new one.
if fuzzy_labeling:
row_label_set = previously_clustered_signals[row]
col_label_set = previously_clustered_signals[col]
if not row_label_set & col_label_set:
cluster_dict[new_cluster_label] = [row, col]
previously_clustered_signals[row] = {new_cluster_label} | row_label_set
previously_clustered_signals[col] = {new_cluster_label} | col_label_set
# print("created new cluster #", new_cluster_label, cluster_dict[new_cluster_label])
new_cluster_label += 1
else:
# We're using fuzzy labeling and these two signals represent a 'bridge' between two signal
# clusters. Merge the label sets: add col to row's unique clusters, add row to col's unique
# clusters, and give both signals the combined set of labels.
for label in row_label_set - col_label_set:
cluster_dict[label].append(col)
previously_clustered_signals[col] = row_label_set | col_label_set
for label in col_label_set - row_label_set:
cluster_dict[label].append(row)
previously_clustered_signals[row] = row_label_set | col_label_set
# print(row, col, "already in cluster_dict", previously_clustered_signals[row], "&",
# previously_clustered_signals[col])
# Delete any duplicate clusters
cluster_sets = []
deletion_labels = []
for label, cluster in cluster_dict.items():
this_set = set(cluster)
if this_set in cluster_sets:
deletion_labels.append(label)
else:
cluster_sets.append(this_set)
for label in deletion_labels:
del cluster_dict[label]
return cluster_dict
# NOTE: This method has a LOT of redundancy with the subset_selection, signal_correlation, and greedy_signal_clustering
# logic. If you know you always want to perform label propagation, it would be more efficient to incorporate it directly
# into those functions. Since this code base is more of a Proof of Concept, label propagation is deliberately pulled
# out as a distinct method to make the pipeline steps as distinct as possible.
def label_propagation(a_timer: PipelineTimer,
pickle_clusters_filename: str = '',
pickle_all_signals_df_filename: str = '',
csv_signals_correlation_filename: str = '',
signal_dict: dict = None,
cluster_dict: dict = None,
correlation_threshold: float = 0.8,
force: bool = False):
if path.isfile(pickle_all_signals_df_filename) and path.isfile(csv_signals_correlation_filename):
if force:
# Remove any existing data.
remove(pickle_all_signals_df_filename)
remove(csv_signals_correlation_filename)
remove(pickle_clusters_filename)
else:
print("\nA DataFrame and correlation matrix for label propagation appears to exist and forcing is turned "
"off. Using " + pickle_all_signals_df_filename + ", " + csv_signals_correlation_filename + ", and "
+ pickle_clusters_filename)
return [load(open(pickle_all_signals_df_filename, "rb")),
read_csv(csv_signals_correlation_filename, index_col=0).rename(index=literal_eval,
columns=literal_eval),
load(open(pickle_clusters_filename, "rb"))]
a_timer.start_function_time()
non_static_signals_dict = {}
largest_index = []
df_columns = []
# Put all non-static signals into one DataFrame. Re-index all of them to share the same index.
for k_arb_id, arb_id_signals in signal_dict.items():
for k_signal_id, signal in arb_id_signals.items():
if not signal.static:
non_static_signals_dict[k_signal_id] = signal
df_columns.append(k_signal_id)
if signal.time_series.__len__() > largest_index.__len__():
largest_index = signal.time_series.index
df: DataFrame = DataFrame(zeros((largest_index.__len__(), df_columns.__len__())),
columns=df_columns,
index=largest_index)
for k_signal_id, signal in non_static_signals_dict.items():
df[k_signal_id] = signal.time_series.reindex(index=largest_index, method='nearest')
# Calculate the correlation matrix for this DataFrame of all non-static signals.
correlation_matrix = df.corr()
# Re-run the algorithm from greedy_signal_clustering but omitting the logic for creating new clusters.
# This effectively propagates the labels generated by the subset of signals with the largest Shannon Index values
# to any correlated signals which were not part of that subset.
correlation_keys = correlation_matrix.columns.values
previously_clustered_signals = {}
for k_cluster_id, cluster in cluster_dict.items():
for k_signal_id in cluster:
previously_clustered_signals[k_signal_id] = k_cluster_id
for n, row in enumerate(correlation_keys):
for m, col in enumerate(correlation_keys):
if n == m:
# this is a diagonal on the correlation matrix. Skip it.
continue
# I chose to round here to allow relationships that are 'oh so close' to make it. No reason this HAS to be done.
result = round(correlation_matrix.iloc[n, m], 2)
# check if this is a significant correlation according to our heuristic threshold.
if result >= correlation_threshold:
# if row signal is already a member of a cluster
if row in previously_clustered_signals.keys():
# if col signal is already a member of a cluster
if col in previously_clustered_signals.keys():
# print(row, col, "already in clusters", previously_clustered_signals[row], "&",
# previously_clustered_signals[col])
continue
# if col is not already in a cluster, add it to row's cluster
else:
# print("adding", col, "to cluster", clusters[previously_clustered_signals[row]])
cluster_dict[previously_clustered_signals[row]].append(col)
previously_clustered_signals[col] = previously_clustered_signals[row]
# row signal hasn't been added to a cluster
else:
# if col signal is already a member of a cluster
if col in previously_clustered_signals.keys():
# print("adding", row, "to cluster", clusters[previously_clustered_signals[col]])
# row is not already in a cluster, add it to col's cluster
cluster_dict[previously_clustered_signals[col]].append(row)
previously_clustered_signals[row] = previously_clustered_signals[col]
a_timer.set_label_propagation()
df.dropna(axis=0, how='any', inplace=True)
df.dropna(axis=1, how='any', inplace=True)
return df, correlation_matrix, cluster_dict
def j1979_signal_labeling(a_timer: PipelineTimer,
j1979_corr_filename: str = "",
signal_filename: str = "",
df_signals: DataFrame = None,
j1979_dict: dict = None,
signal_dict: dict = None,
correlation_threshold: float = 0.8,
force: bool = False) -> [dict, DataFrame]:
if force:
if path.isfile(j1979_corr_filename):
remove(j1979_corr_filename)
if path.isfile(signal_filename):
remove(signal_filename)
if path.isfile(j1979_corr_filename):
print("\nA J1979 correlation matrix for signal labeling appears to exist and forcing is turned off. Using "
+ j1979_corr_filename + " and the signal dictionary passed to j1979_signal_labeling().")
return signal_dict, load(open(j1979_corr_filename, "rb"))
# If a signal dictionary has already been pickled, j1979 correlation has not been pickled, and there is J1979 data
# in this sample, this implies that the pickled signal dictionary does not include the appropriate J1979 tags on the
# Signal objects. Delete that pickled dictionary, tag Signals in signal_dict, and then re-pickle that dictionary in
# Sample.py.
elif path.isfile(signal_filename) and j1979_dict:
if path.isfile(signal_filename):
remove(signal_filename)
latest_start_index = 0.0
earliest_end_index = 99999999999999.9
df_columns = []
for pid, pid_data in j1979_dict.items(): # type: int, J1979
if latest_start_index < pid_data.data.index[0]:
latest_start_index = pid_data.data.index[0]
if earliest_end_index > pid_data.data.index[-1]:
earliest_end_index = pid_data.data.index[-1]
df_columns.append(pid_data.title)
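# Trim the signal DataFrame to the time window covered by every J1979 PID so the PIDs can be reindexed
# onto its index without extrapolating.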
df_signals = df_signals.loc[latest_start_index:earliest_end_index] # type: DataFrame
df_j1979: DataFrame = DataFrame(zeros((df_signals.shape[0], df_columns.__len__())),
columns=df_columns,
index=df_signals.index)
for pid, pid_data in j1979_dict.items(): # type: int, J1979
df_j1979[pid_data.title] = pid_data.data.reindex(index=df_signals.index, method='nearest')
df_combined = concat([df_signals, df_j1979], axis=1)
correlation_matrix = df_combined.corr()
correlation_matrix.dropna(axis=1, how='all', inplace=True)
correlation_matrix.dropna(axis=0, how='all', inplace=True)
# Just consider the J1979 column correlations. Slice off the identity rows at the end of the DataFrame as well.
for index, row in correlation_matrix[df_columns][:-len(df_columns)].iterrows():
row = abs(row)
max_index = row.idxmax(skipna=True)
if row[max_index] >= correlation_threshold:
# Index is the tuple identifying the signal (Arb ID, Start Index, Stop Index); signal_dict keyed on Arb ID
signal = signal_dict[index[0]][index] # type: Signal
# print("Adding J1979 attribute " + max_index + " to signal ID " + str(index))
signal.j1979_title = max_index
signal.j1979_pcc = row[max_index]
signal_dict[index[0]][index] = signal
# print(i, index, row[max_index], max_index, row.values)
return signal_dict, correlation_matrix[df_columns][:-len(df_columns)]
# correlation_matrix.to_csv('j1979_correlation.csv')

View File

@@ -0,0 +1,43 @@
from pandas import Series
from math import log10
class Signal:
def __init__(self, arb_id: int, start_index: int, stop_index: int):
self.arb_id: int = arb_id
self.start_index: int = start_index
self.stop_index: int = stop_index
self.time_series: Series = None
self.static: bool = True
self.shannon_index: float = 0
self.plot_title: str = ""
self.j1979_title: str = None
self.j1979_pcc: float = 0
def normalize_and_set_metadata(self, normalize_strategy):
self.set_shannon_index()
self.update_static()
self.set_plot_title()
self.normalize(normalize_strategy)
def set_shannon_index(self):
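# Shannon index H = -sum(p_i * log10(p_i)) over the distinct values in the time series; a signal that
# never changes has H = 0 and is treated as static.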
si: float = 0.0
n: int = self.time_series.__len__()
for count in self.time_series.value_counts():
# calculate proportion of this integer value in the total population of values
p_i = count / n
# calculate the Shannon Index (si) given p_i of this value.
si += p_i * log10(p_i)
si *= -1
self.shannon_index = si
def update_static(self):
if self.shannon_index >= .000001:
self.static = False
def set_plot_title(self):
self.plot_title = "Time Series from Bit Positions " + str(self.start_index) + " to " + str(self.stop_index) + \
" of Arb ID " + hex(int(self.arb_id))
def normalize(self, normalize_strategy):
normalize_strategy(self.time_series.values, copy=False)

View File

@@ -0,0 +1,105 @@
from LexicalAnalysis import get_composition_just_tang, merge_tokens_just_composition
from sklearn.model_selection import KFold
from ArbID import ArbID
from numpy import arange, ndarray, zeros, float16, add, divide, argmax, unravel_index
# Threshold parameters used during tokenization.
tokenization_bit_distance: float = 0.2
tokenize_padding: bool = True
def alignment_score(mismatch: int, bit_width: int):
# Alignment Score: 1 - (union-intersection) / (n-1)
# The range of Alignment Score is 0 (all mismatch) to 1 (all match)
return float16(1 - mismatch / (bit_width - 1))
def borders(token: tuple, last_index: int):
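# Return the bit positions marking this token's boundaries (the bit just before the token and the token's
# last bit), skipping boundaries that coincide with the start or end of the payload.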
if not token[0] == 0:
if not token[1] == last_index:
return [token[0]-1, token[1]]
else:
return [token[0]-1]
else:
if not token[1] == last_index:
return [token[1]]
return []
def train_test_alignment_score(tang_a: ndarray, tang_b: ndarray, max_inversion: float, max_merge: float):
# Note: padding is used as a dummy variable to maintain compatibility with the primary pipeline get_composition()
comp_a, padding = get_composition_just_tang(tang_a, include_padding=True, max_inversion_distance=max_inversion)
comp_a = merge_tokens_just_composition(tokens=comp_a, this_tang=tang_a, max_distance=max_merge)
comp_b, padding = get_composition_just_tang(tang_b, include_padding=True, max_inversion_distance=max_inversion)
comp_b = merge_tokens_just_composition(tokens=comp_b, this_tang=tang_b, max_distance=max_merge)
bit_width = len(tang_a)
id_a_borders = []
id_b_borders = []
for token in comp_a:
# print("\nToken:", token)
# result = borders(token, bit_width-1)
# print("\tBorders:", result)
id_a_borders.extend(borders(token, bit_width-1))
for token in comp_b:
# print("\nToken:", token)
# result = borders(token, bit_width - 1)
# print("\tBorders:", result)
id_b_borders.extend(borders(token, bit_width-1))
id_a_borders = set(id_a_borders)
id_b_borders = set(id_b_borders)
# Mismatch set is the symmetric difference: borders where there was a border in only one of the two payloads.
mismatch_set = id_a_borders.union(id_b_borders) - id_a_borders.intersection(id_b_borders)
return alignment_score(len(mismatch_set), bit_width)
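# A score of 1.0 means the train and test TANGs tokenize with identical boundaries; each mismatched
# boundary lowers the score by 1 / (bit_width - 1).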
class Validator:
def __init__(self,
use_j1979: bool = False,
fold_n: int = 5):
self.use_j1979: bool = use_j1979
self.fold_n: int = fold_n
@staticmethod
# This function allows for pickling just the score matrix and re-creating a Sample object using it later.
def set_lex_threshold_parameters(sample):
if sample.avg_score_matrix.shape[0] > 1:
optimal_setting = argmax(sample.avg_score_matrix)
optimal_setting = unravel_index(optimal_setting, sample.avg_score_matrix.shape)
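# unravel_index converts the flat argmax into (row, col) grid indices. Note these are indices into the
# 0.00-1.00 (step 0.01) threshold grids built in k_fold_lex_threshold_selection, not the threshold values.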
sample.optimal_bit_dist = optimal_setting[0]
sample.optimal_merge_dist = optimal_setting[1]
else:
print("\nSet lex threshold parameters was improperly called for sample " + sample.output_vehicle_dir)
def k_fold_lex_threshold_selection(self, id_dict: dict, sample):
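# Grid-search the tokenization thresholds: every (inversion distance, merge distance) pair in [0, 1] at
# 0.01 resolution is scored with k-fold cross validation over each qualifying Arb ID's payload matrix
# (static and short IDs are skipped).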
list_of_inversion_values = arange(0, 1.01, 0.01)
list_of_merge_values = arange(0, 1.01, 0.01)
sample.avg_score_matrix = zeros((len(list_of_inversion_values), len(list_of_merge_values)), dtype=float16)
number_of_ids_scored: int = 0
for id_label, arb_id in id_dict.items(): # type: int, ArbID
if arb_id.static or arb_id.short:
continue
this_id_avg_score_matrix = zeros((len(list_of_inversion_values), len(list_of_merge_values)), dtype=float16)
print("\tID:", id_label, "\tnumber of observed payloads:", arb_id.original_data.shape[0])
kf = KFold(n_splits=self.fold_n)
for k, (train, test) in enumerate(kf.split(arb_id.boolean_matrix)):
score_matrix = zeros((len(list_of_inversion_values), len(list_of_merge_values)), dtype=float16)
train_tang = arb_id.generate_tang(boolean_matrix=arb_id.boolean_matrix[train])
test_tang = arb_id.generate_tang(boolean_matrix=arb_id.boolean_matrix[test])
for m, i in enumerate(list_of_inversion_values):
for n, j in enumerate(list_of_merge_values):
score_matrix[m, n] = train_test_alignment_score(train_tang, test_tang, i, j)
this_id_avg_score_matrix = add(this_id_avg_score_matrix, score_matrix)
this_id_avg_score_matrix = divide(this_id_avg_score_matrix, self.fold_n)
sample.avg_score_matrix = add(sample.avg_score_matrix, this_id_avg_score_matrix)
number_of_ids_scored += 1
sample.avg_score_matrix = divide(sample.avg_score_matrix, number_of_ids_scored)
self.set_lex_threshold_parameters(sample)

View File

@@ -0,0 +1,81 @@
from numpy import arange, ndarray, zeros, concatenate, uint64
from math import log10
from pandas import Series, concat
def shannon_index(X: Series):
H: float = 0.0
n: int = X.shape[0]
for count in X.value_counts():
# calculate proportion of this integer value in the total population of values
p_i = count / n
# calculate the Shannon Index (H) given p_i of this value.
H += p_i * log10(p_i)
H *= -1
return H
def make_binary_matrix(X: Series):
boolean_matrix = zeros((X.shape[0], 8), dtype=uint64)
for i, row in enumerate(X.iteritems()):
if row[1] > 0:
# i is the row in the boolean_matrix. Each value here is a single byte, so it always expands to
# exactly 8 boolean columns via its '08b' binary string representation.
bin_string = format(row[1], '08b')
boolean_matrix[i, :] = [x == '1' for x in bin_string]
return boolean_matrix
def binary_to_int(X: ndarray, tokens: list):
signals = {}
for token in tokens:
# Convert the binary ndarray to a list of string representations of each row
temp1 = [''.join(str(x) for x in row) for row in X[:, token[0]:token[1] + 1]]
temp2 = zeros((temp1.__len__(), 1), dtype=uint64)
# convert each string representation to int
for i, row in enumerate(temp1):
temp2[i] = int(row, 2)
# create an unsigned integer pandas.Series from the decoded values (this scratch example has no time index)
signal = Series(temp2[:, 0])
signals[token] = signal
return signals
x1 = Series(arange(0, 32, 1))
x1 = concat([x1, x1], axis=0, ignore_index=True)
x1 = concat([x1, x1], axis=0, ignore_index=True)
x1 = concat([x1, x1], axis=0, ignore_index=True)
# x1 = concat([Series(arange(0, 10, 1)), x1], axis=0, ignore_index=True)
# x1 = concat([x1, x1], axis=0, ignore_index=True)
x2 = Series(arange(0, 256, 1))
# print(x1)
# print(shannon_index(x1))
# print(shannon_index(x2))
x1_bin = make_binary_matrix(x1)
x2_bin = make_binary_matrix(x2)
# print(x1_bin)
# print(x2_bin.shape)
x12_bin = concatenate((x1_bin, x2_bin), axis=1)
lower = 0
upper = 15
cutoff = 0
sum_shannon = {}
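# Sweep the cut point across the 16-bit combined payload and record the total Shannon index of the two
# resulting integer signals at each split.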
while cutoff < upper:
d = binary_to_int(x12_bin, [(0, cutoff), (cutoff+1, 15)])
this_sum = 0
for k, v in d.items():
this_sum += shannon_index(v)
sum_shannon[cutoff] = this_sum
cutoff += 1
for k, v in sum_shannon.items():
print(k, v)

14
R/.Rhistory Normal file
View File

@@ -0,0 +1,14 @@
demo()
demo(graphics)
library(rEDM)
devtools::install_github("ha0ye/rEDM")
library(devtools)
library(rEDM)
setwd("C:/Users/brent/Google Drive/AFIT/PhD Research/Validation/R")
speed_brake_rpm <- read.csv("speed_brake_rpm.csv")
ts_speed <- speed_brake_rpm$speed
lib_speed <- c(1, 0.5*length(ts_speed))
pred_speed <- c(0.5*length(ts_speed)+1, length(ts_speed))
simplex_output_speed <- simplex(ts_speed, lib_speed, pred_speed, E = 9, tp = 1:1000)
save(simplex_output_speed, file = "simplex_output_speed_E_9_tp_1-1000.Rda")
plot(simplex_output_speed$tp, simplex_output_speed$rho, type = "l", xlab = "Time to Prediction (tp)", ylab = "Forecast Skill (rho)")

6
R/CCM_speed_brake_rpm.R Normal file
View File

@@ -0,0 +1,6 @@
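# Note: this fragment assumes the surrounding R session already defines `speed_brake_rpm` (read from the
# accompanying csv), `vars` (e.g. vars <- names(speed_brake_rpm)[2:4]), a library size `n`, and an empty
# `ccm_rho_matrix`, e.g. matrix(NA, length(vars), length(vars), dimnames = list(vars, vars)).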
for (ccm_from in vars) {
for (ccm_to in vars[vars != ccm_from]) {
out_temp <- ccm(speed_brake_rpm, E = 9, lib_column = ccm_from, target_column = ccm_to, lib_sizes = n, replace = FALSE, silent = TRUE)
ccm_rho_matrix[ccm_from, ccm_to] <- out_temp$rho
}
}

17842
R/brake_data.csv Normal file

File diff suppressed because it is too large Load Diff

BIN
R/brake_xmap_rpm.Rda Normal file

Binary file not shown.

BIN
R/brake_xmap_rpm_v2.Rda Normal file

Binary file not shown.

BIN
R/brake_xmap_speed.Rda Normal file

Binary file not shown.

BIN
R/brake_xmap_speed_v2.Rda Normal file

Binary file not shown.

167
R/city/.Rhistory Normal file
View File

@@ -0,0 +1,167 @@
demo()
demo(graphics)
library(rEDM)
devtools::install_github("ha0ye/rEDM")
library(devtools)
libary(rEDM)
package(rEDM)
library(rEDM)
setwd("C:/Users/brent/Google Drive/AFIT/PhD Research/Validation/R/city")
speed_brake_rpm <- read.csv("speed_brake_rpm_city.csv")
load(smap_output_brake.RDA)
load(smap_output_brake.Rda)
load(file = "smap_output_brake.Rda")
View(smap_output_brake)
View(speed_brake_rpm)
lib <- c(1, 0.5*length(speed_brake_rpm))
pred <- c(0.5*length(speed_brake_rpm)+1, length(speed_brake_rpm))
len(speed_brake_rpm)
data(speed_brake_rpm)
length(speed_brake_rpm)
lib <- c(1, 0.5*22792)
pred <- c(0.5*22792+1, 22792)
cols <- c("speed", "rpm", "brake")
target <- "brake"
block_smap_output <- block_lnlp(speed_brake_rpm, lib = lib, pred = pred, columns = cols, target_column = target, method = "s-map", theta = 3, stats_only = FALSE, first_column_time = TRUE, save_smap_coefficients = TRUE, silent = TRUE)
save(block_smap_output, file = "brake_block_smap_output.Rda")
smap_coeffs <- block_smap_output$smap_coefficients[[1]]
str(smap_coeffs)
predictions <- block_smap_output$model_output[[1]]
t <- predictions$time
plot(t, predictions$obs, type = "l", col = "black", ylab = "x", xlab = "")
lines(t, predictions$pred, lty = 2)
legend("topright", legend = c("observed", "predicted"), lty = c(1, 2), bty = "n")
plot(t, smap_coeffs[, 1], type = "l", col = "red", ylab = "effect of speed", xlab = "")
plot(t, smap_coeffs[, 2], type = "l", col = "blue", ylab = "effect of rpm", xlab = "")
plot(t, smap_coeffs[, 3], type = "l", col = "magenta", ylab = "effect of lags", xlab = "")
predictions <- block_smap_output$model_output[[1]]
t <- predictions$time
plot(t, predictions$obs, type = "l", col = "black", ylab = "x", xlab = "")
lines(t, predictions$pred, lty = 2)
legend("topright", legend = c("observed", "predicted"), lty = c(1, 2), bty = "n")
plot(t, smap_coeffs[, 1], type = "l", col = "red", ylab = "effect of speed", xlab = "")
plot(t, smap_coeffs[, 2], type = "l", col = "blue", ylab = "effect of rpm", xlab = "")
plot(t, smap_coeffs[, 3], type = "l", col = "magenta", ylab = "effect of lags", xlab = "")
cols = c("brake", "speed", "rpm")
block_smap_output <- block_lnlp(speed_brake_rpm, lib = lib, pred = pred, columns = cols, target_column = target, method = "s-map", theta = 3, stats_only = FALSE, first_column_time = TRUE, save_smap_coefficients = TRUE, silent = TRUE)
save(block_smap_output, file = "brake_block_smap_output.Rda")
smap_coeffs <- block_smap_output$smap_coefficients[[1]]
str(smap_coeffs)
predictions <- block_smap_output$model_output[[1]]
t <- predictions$time
plot(t, predictions$obs, type = "l", col = "black", ylab = "x", xlab = "")
lines(t, predictions$pred, lty = 2)
legend("topright", legend = c("observed", "predicted"), lty = c(1, 2), bty = "n")
plot(t, smap_coeffs[, 1], type = "l", col = "red", ylab = "effect of lags", xlab = "")
plot(t, smap_coeffs[, 2], type = "l", col = "blue", ylab = "effect of speed", xlab = "")
plot(t, smap_coeffs[, 3], type = "l", col = "magenta", ylab = "effect of rpm", xlab = "")
cols = c("speed", "rpm")
block_smap_output <- block_lnlp(speed_brake_rpm, lib = lib, pred = pred, columns = cols, target_column = target, method = "s-map", theta = 3, stats_only = FALSE, first_column_time = TRUE, save_smap_coefficients = TRUE, silent = TRUE)
smap_coeffs <- block_smap_output$smap_coefficients[[1]]
str(smap_coeffs)
predictions <- block_smap_output$model_output[[1]]
t <- predictions$time
plot(t, predictions$obs, type = "l", col = "black", ylab = "x", xlab = "")
lines(t, predictions$pred, lty = 2)
legend("topright", legend = c("observed", "predicted"), lty = c(1, 2), bty = "n")
plot(t, smap_coeffs[, 1], type = "l", col = "red", ylab = "effect of speed", xlab = "")
plot(t, smap_coeffs[, 2], type = "l", col = "blue", ylab = "effect of rpm", xlab = "")
list_of_model_predictions <- block_smap_output$model_output
first_data_frame_of_predictions <- list_of_model_predictions[[1]]
observed <- first_data_frame_of_predictions$obs
predicted <- first_data_frame_of_predictions$pred
plot_range <- range(c(observed, predicted), na.rm = TRUE)
plot(observed, predicted, xlim = plot_range, ylim = plot_range, xlab = "Observed", ylab = "Predicted", asp = 1)
abline(a = 0, b = 1, lty = 2, col = "blue")
View(block_smap_output)
View(block_smap_output)
View(first_data_frame_of_predictions)
View(first_data_frame_of_predictions)
View(block_smap_output)
str(block_smap_output.rmse)
str(block_smap_output$rmse)
data(block_3sp)
View(block_3sp)
block_smap_output <- block_lnlp(speed_brake_rpm, lib = lib, pred = pred, columns = cols, target_column = target, method = "s-map", tp = 300, theta = 3, stats_only = FALSE, first_column_time = TRUE, save_smap_coefficients = TRUE, silent = TRUE)
smap_coeffs <- block_smap_output$smap_coefficients[[1]]
str(smap_coeffs)
predictions <- block_smap_output$model_output[[1]]
t <- predictions$time
plot(t, predictions$obs, type = "l", col = "black", ylab = "x", xlab = "")
lines(t, predictions$pred, lty = 2)
legend("topright", legend = c("observed", "predicted"), lty = c(1, 2), bty = "n")
plot(t, smap_coeffs[, 1], type = "l", col = "red", ylab = "effect of speed", xlab = "")
plot(t, smap_coeffs[, 2], type = "l", col = "blue", ylab = "effect of rpm", xlab = "")
View(block_smap_output)
list_of_model_predictions <- block_smap_output$model_output
first_data_frame_of_predictions <- list_of_model_predictions[[1]]
observed <- first_data_frame_of_predictions$obs
predicted <- first_data_frame_of_predictions$pred
plot_range <- range(c(observed, predicted), na.rm = TRUE)
plot(observed, predicted, xlim = plot_range, ylim = plot_range, xlab = "Observed", ylab = "Predicted", asp = 1)
abline(a = 0, b = 1, lty = 2, col = "blue")
load(file = "ccm_optimal_tp.Rda")
View(output_df)
View(output_df)
ccm_speed_xmap_brake_neg400 = ccm(speed_brake_rpm, E = 9, lib_sizes = NROW(speed_brake_rpm), lib_column = "speed", target_column = "brake", tp = -400, random_libs = FALSE, num_neighbors = "E+1", silent = TRUE, stats_only=FALSE, first_column_time = TRUE)
ccm_speed_xmap_brake_neg400 = ccm(speed_brake_rpm, E = 9, lib_sizes = NROW(speed_brake_rpm), lib_column = "speed", target_column = "brake", tp = -400, random_libs = FALSE, num_neighbors = "E+1", silent = TRUE, first_column_time = TRUE)
View(ccm_speed_xmap_brake_neg400)
View(smap_output_brake)
output_df_means <- ccm_means(output_df)
warnings()
View(output_df_means)
output_brake <- output_df[11]
View(output_brake)
output_brake <- output_df[[11]]
View(output_df)
output_brake <- ccm_means(output_df[11])
output_brake <- ccm_means(output_df[[11]])
output_df[[11]]
output_df[11]
output_df[11,]
output_brake <- ccm_means(output_df[11,])
output_brake <- output_df[11,]
View(output_brake)
output_brake <- output_brake[,-1]
output_brake <- output_df[11,]
output_brake <- output_brake[,-1]
output_brake <- output_df[11,]
output_brake <- output_brake[,-12]
output_brake <- ccm_means(output_brake)
View(output_brake)
View(ccm_speed_xmap_brake_neg400)
output_brake <- ccm_means(ccm_speed_xmap_brake_neg400)
View(output_brake)
View(ccm_speed_xmap_brake_neg400)
brake_observed <- compute_stats
View(brake_observed)
compute_stats(ccm_speed_xmap_brake_neg400)
compute_stats(speed_brake_rpm$brake, ccm_speed_xmap_brake_neg400)
View(ccm_speed_xmap_brake_neg400)
brake_multiview <- multiview(speed_brake_rpm, lib = c(1, floor(NROW(speed_brake_rpm)/2)), pred = c(floor(NROW(speed_brake_rpm)/2) + 1, NROW(speed_brake_rpm)), norm_type = "L2 norm", P = 0.5, E = 9, tau = 1, tp = -300, max_lag = 3, num_neighbors = "e+1", k = "sqrt", na.rm = FALSE, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, exclusion_radius = NULL, silent = TRUE)
brake_multiview <- multiview(speed_brake_rpm, lib = c(1, floor(NROW(speed_brake_rpm)/2)), pred = c(floor(NROW(speed_brake_rpm)/2) + 1, NROW(speed_brake_rpm)), norm_type = "L2 norm", P = 0.5, E = 9, tau = 1, tp = -300, max_lag = 3, num_neighbors = "e+1", k = "sqrt", na.rm = FALSE, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, exclusion_radius = NULL, silent = TRUE, k = "all")
brake_multiview <- multiview(speed_brake_rpm, lib = c(1, floor(NROW(speed_brake_rpm)/2)), pred = c(floor(NROW(speed_brake_rpm)/2) + 1, NROW(speed_brake_rpm)), norm_type = "L2 norm", P = 0.5, E = 9, tau = 1, tp = -300, max_lag = 3, num_neighbors = "e+1", k = "sqrt", na.rm = FALSE, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, exclusion_radius = NULL, silent = TRUE)
brake_multiview <- multiview(speed_brake_rpm, lib = c(1, floor(NROW(speed_brake_rpm)/2)), pred = c(floor(NROW(speed_brake_rpm)/2) + 1, NROW(speed_brake_rpm)), norm_type = "L2 norm", E = 9, tau = 1, tp = -300, max_lag = 3, num_neighbors = "e+1", k = "sqrt", na.rm = FALSE, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, exclusion_radius = NULL, silent = TRUE)
brake_multiview <- multiview(speed_brake_rpm, lib = c(1, floor(NROW(speed_brake_rpm)/2)), pred = c(floor(NROW(speed_brake_rpm)/2) + 1, NROW(speed_brake_rpm)), norm_type = "L2 norm", E = 9, tau = 1, tp = 1, max_lag = 3, num_neighbors = "e+1", k = "sqrt", na.rm = FALSE, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, exclusion_radius = NULL, silent = TRUE)
View(speed_brake_rpm)
brake_multiview <- multiview(speed_brake_rpm, lib = c(1, floor(NROW(speed_brake_rpm)/2)), pred = c(floor(NROW(speed_brake_rpm)/2) + 1, NROW(speed_brake_rpm)), norm_type = "L2 norm", E = 9, tau = 1, tp = 1, max_lag = 3, num_neighbors = "e+1", k = "sqrt", na.rm = FALSE, target_column = "brake", stats_only = FALSE, first_column_time = TRUE, silent = TRUE)
brake_multiview <- multiview(speed_brake_rpm, target_column = "brake", stats_only = FALSE, first_column_time = TRUE, silent = TRUE)
View(brake_multiview)
predictions <- brake_multiview$model_output[[1]]
t <- predictions$time
plot(t, predictions$obs, type = "l", col = "black", ylab = "x", xlab = "")
lines(t, predictions$pred, lty = 2)
legend("topright", legend = c("observed", "predicted"), lty = c(1, 2), bty = "n")
save(brake_multiview, file = "brake_multiview.Rda")
View(brake_multiview)
brake_multiview <- multiview(speed_brake_rpm, lib = c(1, floor(NROW(speed_brake_rpm)/2)), pred = c(floor(NROW(speed_brake_rpm)/2) + 1, NROW(speed_brake_rpm)), norm_type = "L2 norm", E = 9, tau = 1, tp = -300, max_lag = 3, num_neighbors = "e+1", k = "sqrt", na.rm = FALSE, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, exclusion_radius = NULL, silent = TRUE)
brake_multiview <- multiview(speed_brake_rpm, E = 9, tau = 1, tp = -300, max_lag = 3, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, silent = TRUE)
brake_multiview <- multiview(speed_brake_rpm, tau = 1, tp = -300, max_lag = 3, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, silent = TRUE)
View(brake_multiview)
load(file = "brake_multiview.Rda")
multiview_output <- brake_multiview$model_output
View(multiview_output)
multiview_output <- multiview_output[1]
View(multiview_output)
multiview_output <- multiview_output[[1]]
View(multiview_output)
write.csv(multiview_output, file = "multiview_brake.csv")


22792
R/city/brake_data_city.csv Normal file

File diff suppressed because it is too large Load Diff

BIN
R/city/brake_multiview.Rda Normal file

Binary file not shown.

BIN
R/city/brake_xmap_rpm.Rda Normal file

Binary file not shown.

BIN
R/city/brake_xmap_speed.Rda Normal file

Binary file not shown.

BIN
R/city/ccm_one_row_city.pdf Normal file

Binary file not shown.

BIN
R/city/ccm_optimal_tp.Rda Normal file

Binary file not shown.

BIN
R/city/ccm_tp.pdf Normal file

Binary file not shown.

View File

@@ -0,0 +1,317 @@
par() features
mar
A numerical vector of the form c(bottom, left, top, right) which gives the number of lines of margin to be specified on the four sides of the plot. The default is c(5, 4, 4, 2) + 0.1
oma
A vector of the form c(bottom, left, top, right) giving the size of the outer margins in lines of text.
mgp
The margin line (in mex units) for the axis title, axis labels and axis line. Note that mgp[1] affects title whereas mgp[2:3] affect axis. The default is c(3, 1, 0).
library(rEDM)
setwd("C:/Users/brent/Google Drive/AFIT/PhD Research/Validation/R/city")
speed_brake_rpm <- read.csv("speed_brake_rpm_city.csv")
# CDN: setwd("C:/Users/bnolan/Google Drive/AFIT/PhD Research/Validation/R/city")
===============================================================================================================================================================================
1. Perform Simplex Projection to determine best embedding dimension
===============================================================================================================================================================================
ts_speed <- speed_brake_rpm$speed
lib_speed <- c(1, 0.5*length(ts_speed))
pred_speed <- c(0.5*length(ts_speed)+1, length(ts_speed))
ts_rpm <- speed_brake_rpm$rpm
lib_rpm <- c(1, 0.5*length(ts_rpm))
pred_rpm <- c(0.5*length(ts_rpm)+1, length(ts_rpm))
ts_brake <- speed_brake_rpm$brake
lib_brake <- c(1, 0.5*length(ts_brake))
pred_brake <- c(0.5*length(ts_brake)+1, length(ts_brake))
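# The first half of each series is the library used to build the model and the second half is held out
# for prediction, so rho measures out-of-sample forecast skill.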
simplex_output_speed <- simplex(ts_speed, lib_speed, pred_speed, silent = TRUE)
simplex_output_rpm <- simplex(ts_rpm, lib_rpm, pred_rpm, silent = TRUE)
simplex_output_brake <- simplex(ts_brake, lib_brake, pred_brake, silent = TRUE)
save(simplex_output_speed, file = "simplex_output_speed.Rda")
save(simplex_output_rpm, file = "simplex_output_rpm.Rda")
save(simplex_output_brake, file = "simplex_output_brake.Rda")
# If instead you're loading previous calculations....
load(file = "simplex_output_speed.Rda")
load(file = "simplex_output_rpm.Rda")
load(file = "simplex_output_brake.Rda")
===============================================================================================================================================================================
1a. Plot and Save the Embedding Results
===============================================================================================================================================================================
par(mar = c(4, 4, 1, 1), mgp = c(2.5, 1, 0)) # set margins for plotting
# create a 1-row, 3-column plot matrix (pg. 260 of R-cookbook)
par(mfrow=c(1,3), cex=1.5)
plot(simplex_output_speed$E, simplex_output_speed$rho, type = "l", xlab = "Embedding Dimension (E)", ylab = expression(paste("Forecast Skill (", rho, ")")), main="Vehicle Speed")
plot(simplex_output_rpm$E, simplex_output_rpm$rho, type = "l", xlab = "Embedding Dimension (E)", main="Engine RPM")
plot(simplex_output_brake$E, simplex_output_brake$rho, type = "l", xlab = "Embedding Dimension (E)", main="Brake Pressure")
# IF YOU WANT TO PLOT ALL THREE ON ONE LINE DIRECTLY TO A 5.5" x 2.5" PDF
# mgp affects (axis label offset, axis number offset, axis line offset). Resize the window in RStudio in order to resize the plot output when using the Export button.
pdf(file = "simplex_embedding_one_row_city.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,0,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(simplex_output_speed$E, simplex_output_speed$rho, type = "l", axes = FALSE, main="Vehicle Speed", ylim = c(0.99, 1.0), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.99, 0.995, 1.0), labels = c(0.99, 0.995, 1.0))
plot(simplex_output_rpm$E, simplex_output_rpm$rho, type = "l", axes = FALSE, main="Engine RPM", ylim = c(0.997, 1.0), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.997, 0.9985, 1.0), labels = c(0.997, 0.9985, 1.0))
plot(simplex_output_brake$E, simplex_output_brake$rho, type = "l", axes = FALSE, main="Brake Pressure", ylim = c(0.5, 0.9), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.5, 0.7, 0.9), labels = c(0.5, 0.7, 0.9))
title(xlab = "Embedding Dimension (E)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
===============================================================================================================================================================================
2. Re-run Simplex Projection with Optimal E to determine whether chaotic noise appears (Rho drops quickly with respect to increased tp)
===============================================================================================================================================================================
# Be sure to reload simplex_output_X using the E=9 computations before the next round of plots
simplex_output_speed <- simplex(ts_speed, lib_speed, pred_speed, E = 9, tp = 1:1000)
save(simplex_output_speed, file = "simplex_output_speed_E_9_tp_1-1000.Rda")
simplex_output_rpm <- simplex(ts_rpm, lib_rpm, pred_rpm, E = 9, tp = 1:1000)
save(simplex_output_rpm, file = "simplex_output_rpm_E_9_tp_1-1000.Rda")
simplex_output_brake <- simplex(ts_brake, lib_brake, pred_brake, E = 9, tp = 1:1000)
save(simplex_output_brake, file = "simplex_output_brake_E_9_tp_1-1000.Rda")
# If instead you're loading previous calculations....
load(file = "simplex_output_speed_E_9_tp_1-1000.Rda")
load(file = "simplex_output_rpm_E_9_tp_1-1000.Rda")
load(file = "simplex_output_brake_E_9_tp_1-1000.Rda")
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "simplex_forecasting_one_row_city.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,0,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(simplex_output_speed$tp, pmax(0, simplex_output_speed$rho), type = "l", axes = FALSE, main="Vehicle Speed", ylim = c(0, 1.0))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0, 0.5, 1.0), labels = c(0, 0.5, 1.0))
plot(simplex_output_rpm$tp, pmax(0, simplex_output_rpm$rho), type = "l", axes = FALSE, main="Engine RPM", ylim = c(0, 1.0))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0, 0.5, 1.0), labels = c(0, 0.5, 1.0))
plot(simplex_output_brake$tp, pmax(0, simplex_output_brake$rho), type = "l", axes = FALSE, main="Brake Pressure", ylim = c(0, 1.0))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0, 0.5, 1.0), labels = c(0, 0.5, 1.0))
title(xlab = "Time to Prediction (tp)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
===============================================================================================================================================================================
2. Distinguish between red noise and nonlinear deterministic behavior using S-maps
===============================================================================================================================================================================
smap_output_speed <- s_map(ts_speed, lib_speed, pred_speed, E = 9, silent = TRUE)
smap_output_rpm <- s_map(ts_rpm, lib_rpm, pred_rpm, E = 9, silent = TRUE)
smap_output_brake <- s_map(ts_brake, lib_brake, pred_brake, E = 9, silent = TRUE)
save(smap_output_speed, file = "smap_output_speed.Rda")
save(smap_output_rpm, file = "smap_output_rpm.Rda")
save(smap_output_brake, file = "smap_output_brake.Rda")
load(file = "smap_output_rpm.Rda")
load(file = "smap_output_speed.Rda")
load(file = "smap_output_brake.Rda")
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "smap_one_row_city.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,1,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(smap_output_speed$theta, smap_output_speed$rho, type = "l", axes = FALSE, main="Vehicle Speed", xlim=c(0,8), ylim = c(0.99997, 0.999975))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.99997,0.999975), labels = c(0.99997,0.999975))
plot(smap_output_rpm$theta, smap_output_rpm$rho, type = "l", axes = FALSE, main="Engine RPM", xlim=c(0,8), ylim = c(0.999931, 0.999937))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.999931,0.999937), labels = c(0.999931,0.999937))
plot(smap_output_brake$theta, smap_output_brake$rho, type = "l", axes = FALSE, main="Brake Pressure", xlim=c(0,8), ylim = c(0.986, 0.9885))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.986, 0.98725, 0.9885), labels = c(0.986, 0.98725, 0.9885))
title(xlab = "Nonlinear Tuning Parameter (theta)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# To do individual plots
plot(smap_output_speed$theta, smap_output_speed$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.9999912,0.9999931))
title(main = "S-Map Forecast Skill For Vehicle Speed")
plot(smap_output_brake$theta, smap_output_brake$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.99693,0.99696))
title(main = "S-Map Forecast Skill For Brake Pressure")
plot(smap_output_rpm$theta, smap_output_rpm$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.9999842,0.9999857))
title(main = "S-Map Forecast Skill For Engine Rotations Per Minute")
===============================================================================================================================================================================
3. Perform Convergent Cross Mapping Between Variables Using Optimal Embedding Dimension E
===============================================================================================================================================================================
# num_samples = 10, lib_sizes = seq(0, 15000, by = 3000) causes 10 models at each of the 5 library sizes
speed_xmap_rpm <- ccm(speed_brake_rpm, lib_column = "speed", target_column = "rpm", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(speed_xmap_rpm, file = "speed_xmap_rpm.Rda")
speed_xmap_brake <- ccm(speed_brake_rpm, lib_column = "speed", target_column = "brake", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(speed_xmap_brake, file = "speed_xmap_brake.Rda")
brake_xmap_rpm <- ccm(speed_brake_rpm, lib_column = "brake", target_column = "rpm", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(brake_xmap_rpm, file = "brake_xmap_rpm.Rda")
brake_xmap_speed <- ccm(speed_brake_rpm, lib_column = "brake", target_column = "speed", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(brake_xmap_speed, file = "brake_xmap_speed.Rda")
rpm_xmap_speed <- ccm(speed_brake_rpm, lib_column = "rpm", target_column = "speed", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(rpm_xmap_speed, file = "rpm_xmap_speed.Rda")
rpm_xmap_brake <- ccm(speed_brake_rpm, lib_column = "rpm", target_column = "brake", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(rpm_xmap_brake, file = "rpm_xmap_brake.Rda")
# X xmap Y: causal effect of (pred) Y onto (lib) X
load(file = "speed_xmap_rpm.Rda")
load(file = "speed_xmap_brake.Rda")
load(file = "brake_xmap_rpm.Rda")
load(file = "brake_xmap_speed.Rda")
load(file = "rpm_xmap_speed.Rda")
load(file = "rpm_xmap_brake.Rda")
speed_xmap_rpm_mean <- ccm_means(speed_xmap_rpm)
speed_xmap_brake_mean <- ccm_means(speed_xmap_brake)
brake_xmap_rpm_mean <- ccm_means(brake_xmap_rpm)
brake_xmap_speed_mean <- ccm_means(brake_xmap_speed)
rpm_xmap_speed_mean <- ccm_means(rpm_xmap_speed)
rpm_xmap_brake_mean <- ccm_means(rpm_xmap_brake)
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "ccm_one_row_city.pdf", width = 5.5, height = 2.0, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,1,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(speed_xmap_rpm_mean$lib_size, pmax(0, speed_xmap_rpm_mean$rho), type = "l", col = "red", axes = FALSE, main="Speed & RPM", xlim = c(3000, 15000), ylim = c(.5, .8))
lines(rpm_xmap_speed_mean$lib_size, pmax(0, rpm_xmap_speed_mean$rho), col = "blue")
legend(x = "right", legend = c("S xmap R", "R xmap S"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(3000, 9000, 15000), labels = c("3k", "9k", "15k"))
axis(side = 2, labels = TRUE)
plot(brake_xmap_rpm_mean$lib_size, pmax(0, brake_xmap_rpm_mean$rho), type = "l", col = "red", axes = FALSE, main="Brake & RPM", xlim = c(3000, 15000), ylim = c(.1, .3))
lines(rpm_xmap_brake_mean$lib_size, pmax(0, rpm_xmap_brake_mean$rho), col = "blue")
legend(x = "topright", legend = c("B xmap R", "R xmap B"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(3000, 9000, 15000), labels = c("3k", "9k", "15k"))
axis(side = 2, labels = TRUE)
plot(speed_xmap_brake_mean$lib_size, pmax(0, speed_xmap_brake_mean$rho), type = "l", col = "red", axes = FALSE, main="Speed & Brake", xlim = c(3000, 15000), ylim = c(.45, .6))
lines(brake_xmap_speed_mean$lib_size, pmax(0, brake_xmap_speed_mean$rho), col = "blue")
legend(x = "topright", legend = c("S xmap B", "B xmap S"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(3000, 9000, 15000), labels = c("3k", "9k", "15k"))
axis(side = 2, labels = TRUE)
title(xlab = "Library Size", ylab = "Cross Map Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
plot(speed_xmap_rpm_mean$lib_size, pmax(0, speed_xmap_rpm_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.86, 0.95))
lines(rpm_xmap_speed_mean$lib_size, pmax(0, rpm_xmap_speed_mean$rho), col = "blue")
legend(x = "topleft", legend = c("Speed xmap Engine RPM", "Engine RPM xmap Speed"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Speed and Engine RPM")
plot(brake_xmap_rpm_mean$lib_size, pmax(0, brake_xmap_rpm_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.35, 0.48))
lines(rpm_xmap_brake_mean$lib_size, pmax(0, rpm_xmap_brake_mean$rho), col = "blue")
legend(x = "left", legend = c("Brake Pressure xmap Engine RPM", "Engine RPM xmap Brake Pressure"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Brake Pressure and Engine RPM")
plot(speed_xmap_brake_mean$lib_size, pmax(0, speed_xmap_brake_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.6, 0.9))
lines(brake_xmap_speed_mean$lib_size, pmax(0, brake_xmap_speed_mean$rho), col = "blue")
legend(x = "left", legend = c("Speed xmap Brake Pressure", "Brake Pressure xmap Speed"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Speed and Brake Pressure")
===============================================================================================================================================================================
Extension 1. Look for optimal TP value
===============================================================================================================================================================================
vars_city <- names(speed_brake_rpm)[2:4]
tp_list = seq(-500, 500, 100)
params_city <- expand.grid(lib_column = vars_city, target_column = vars_city, tp = tp_list)
params_city <- params_city[params_city$lib_column != params_city$target_column, ]
params_city$E <- 9
output <- list()
step_size = 10
for (n in 0:floor(NROW(params_city)/step_size)) {
gc()
for (i in (n*step_size+1):(min((n+1)*step_size, NROW(params_city)))) {
output[[i]] = ccm(speed_brake_rpm, E = 9, lib_sizes = NROW(speed_brake_rpm), lib_column = params_city$lib_column[i], target_column = params_city$target_column[i], tp = params_city$tp[i], random_libs = FALSE, num_neighbors = "E+1", silent = TRUE)
}
}
library(data.table)
output_df = rbindlist(output)
output_df$direction <- paste(output_df$lib_column, "xmap to\n", output_df$target_column)
save(output_df, file = "ccm_optimal_tp.Rda")
library(ggplot2)
time_delay_ccm_fig <- ggplot(output_df, aes(x = tp, y = rho, color = direction)) + geom_line() + theme_bw() + scale_x_continuous(breaks=seq(-500, 500, 100))
print(time_delay_ccm_fig)
ggsave(filename = "ccm_tp.pdf", plot = time_delay_ccm_fig, device = "pdf", width = 5.5, height = 2.5, units = "in", dpi = "print")
===============================================================================================================================================================================
Extension 2. Model Brake using S-map of Brake, Speed, and RPM
===============================================================================================================================================================================
lib <- c(1, 0.5*22792)
pred <- c(0.5*22792+1, 22792)
cols <- c("speed", "rpm", "brake")
target <- "brake"
block_smap_output <- block_lnlp(speed_brake_rpm, lib = lib, pred = pred, columns = cols, target_column = target, method = "s-map", theta = 3, stats_only = FALSE, first_column_time = TRUE, save_smap_coefficients = TRUE, silent = TRUE)
save(block_smap_output, file = "brake_block_smap_output.Rda")
smap_coeffs <- block_smap_output$smap_coefficients[[1]]
str(smap_coeffs)
predictions <- block_smap_output$model_output[[1]]
t <- predictions$time
plot(t, predictions$obs, type = "l", col = "black", ylab = "x", xlab = "")
lines(t, predictions$pred, lty = 2)
legend("topright", legend = c("observed", "predicted"), lty = c(1, 2), bty = "n")
plot(t, smap_coeffs[, 1], type = "l", col = "red", ylab = "effect of speed", xlab = "")
plot(t, smap_coeffs[, 2], type = "l", col = "blue", ylab = "effect of rpm", xlab = "")
list_of_model_predictions <- block_smap_output$model_output
first_data_frame_of_predictions <- list_of_model_predictions[[1]]
observed <- first_data_frame_of_predictions$obs
predicted <- first_data_frame_of_predictions$pred
plot_range <- range(c(observed, predicted), na.rm = TRUE)
plot(observed, predicted, xlim = plot_range, ylim = plot_range, xlab = "Observed", ylab = "Predicted", asp = 1)
abline(a = 0, b = 1, lty = 2, col = "blue")
===============================================================================================================================================================================
Extension 3. Model Brake using CCM from Speed
===============================================================================================================================================================================
ccm_speed_xmap_brake_neg400 = ccm(speed_brake_rpm, E = 9, lib_sizes = NROW(speed_brake_rpm), lib_column = "speed", target_column = "brake", tp = -400, random_libs = FALSE, num_neighbors = "E+1", silent = TRUE, stats_only=FALSE, first_column_time = TRUE)
brake_multiview <- multiview(speed_brake_rpm, E = 9, tau = 1, tp = -300, max_lag = 3, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, silent = TRUE)
predictions <- brake_multiview$model_output[[1]]
t <- predictions$time
plot(t, predictions$obs, type = "l", col = "black", ylab = "x", xlab = "")
lines(t, predictions$pred, lty = 2)
legend("topright", legend = c("observed", "predicted"), lty = c(1, 2), bty = "n")

BIN
R/city/last.dump.rda Normal file

Binary file not shown.

11397
R/city/multiview_brake.csv Normal file

File diff suppressed because it is too large Load Diff

22792
R/city/rpm_data_city.csv Normal file

File diff suppressed because it is too large Load Diff

BIN
R/city/rpm_xmap_brake.Rda Normal file

Binary file not shown.

BIN
R/city/rpm_xmap_speed.Rda Normal file

Binary file not shown.

BIN
R/city/smap_output_rpm.Rda Normal file

Binary file not shown.

22792
R/city/speed_data_city.csv Normal file

File diff suppressed because it is too large Load Diff

BIN
R/city/speed_xmap_brake.Rda Normal file

Binary file not shown.

BIN
R/city/speed_xmap_rpm.Rda Normal file

Binary file not shown.

BIN
R/city/xmap_tp.pdf Normal file

Binary file not shown.

234
R/commands_list.txt Normal file
View File

@@ -0,0 +1,234 @@
par() features
mar
A numerical vector of the form c(bottom, left, top, right) which gives the number of lines of margin to be specified on the four sides of the plot. The default is c(5, 4, 4, 2) + 0.1
oma
A vector of the form c(bottom, left, top, right) giving the size of the outer margins in lines of text.
mgp
The margin line (in mex units) for the axis title, axis labels and axis line. Note that mgp[1] affects title whereas mgp[2:3] affect axis. The default is c(3, 1, 0).
library(rEDM)
setwd("C:/Users/brent/Google Drive/AFIT/PhD Research/Validation/R")
# CDN: setwd("C:/Users/bnolan/Google Drive/AFIT/PhD Research/Validation/R")
speed_brake_rpm <- read.csv("speed_brake_rpm.csv")
===============================================================================================================================================================================
1. Perform Simplex Projection to determine best embedding dimension
===============================================================================================================================================================================
ts_speed <- speed_brake_rpm$speed
lib_speed <- c(1, 0.5*length(ts_speed))
pred_speed <- c(0.5*length(ts_speed)+1, length(ts_speed))
ts_rpm <- speed_brake_rpm$rpm
lib_rpm <- c(1, 0.5*length(ts_rpm))
pred_rpm <- c(0.5*length(ts_rpm)+1, length(ts_rpm))
ts_brake <- speed_brake_rpm$brake
lib_brake <- c(1, 0.5*length(ts_brake))
pred_brake <- c(0.5*length(ts_brake)+1, length(ts_brake))
simplex_output_speed <- simplex(ts_speed, lib_speed, pred_speed, silent = TRUE)
simplex_output_rpm <- simplex(ts_rpm, lib_rpm, pred_rpm, silent = TRUE)
simplex_output_brake <- simplex(ts_brake, lib_brake, pred_brake, silent = TRUE)
save(simplex_output_speed, file = "simplex_output_speed.Rda")
save(simplex_output_rpm, file = "simplex_output_rpm.Rda")
save(simplex_output_brake, file = "simplex_output_brake.Rda")
# If instead you're loading previous calculations....
load(file = "simplex_output_speed.Rda")
load(file = "simplex_output_rpm.Rda")
load(file = "simplex_output_brake.Rda")
===============================================================================================================================================================================
1a. Plot and Save the Embedding Results
===============================================================================================================================================================================
par(mar = c(4, 4, 1, 1), mgp = c(2.5, 1, 0)) # set margins for plotting
# create a 1-row, 3-column plot matrix (pg. 260 of R-cookbook)
par(mfrow=c(1,3), cex=1.5)
plot(simplex_output_speed$E, simplex_output_speed$rho, type = "l", xlab = "Embedding Dimension (E)", ylab = expression(paste("Forecast Skill (", rho, ")")), main="Vehicle Speed")
plot(simplex_output_rpm$E, simplex_output_rpm$rho, type = "l", xlab = "Embedding Dimension (E)", main="Engine RPM")
plot(simplex_output_brake$E, simplex_output_brake$rho, type = "l", xlab = "Embedding Dimension (E)", main="Brake Pressure")
# IF YOU WANT TO PLOT ALL THREE ON ONE LINE DIRECTLY TO A 5.5" x 2.5" PDF
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(5,4,2,1) + 0.1, mar = c(0,0,1,2) + 0.1, mgp=c(2, 1, 0))
# mgp affects (axis label offset, axis number offset, axis line offset). Resize the window in RStudio in order to resize the plot output when using the Export button.
pdf(file = "simplex_embedding_one_row.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,0,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(simplex_output_speed$E, simplex_output_speed$rho, type = "l", axes = FALSE, main="Vehicle Speed", ylim = c(0.9992, 1.0), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, labels = TRUE)
plot(simplex_output_rpm$E, simplex_output_rpm$rho, type = "l", axes = FALSE, main="Engine RPM", ylim = c(0.9975, 1.0), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, labels = TRUE)
plot(simplex_output_brake$E, simplex_output_brake$rho, type = "l", axes = FALSE, main="Brake Pressure", ylim = c(0.97, 0.985), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, labels = TRUE)
title(xlab = "Embedding Dimension (E)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
===============================================================================================================================================================================
1b. Re-run Simplex Projection with the optimal E to determine whether forecast skill decays with the prediction horizon (rho dropping quickly as tp increases is a signature of deterministic chaos rather than noise)
===============================================================================================================================================================================
# Be sure to reload simplex_output_X using the E=9 computations before the next round of plots
simplex_output_speed <- simplex(ts_speed, lib_speed, pred_speed, E = 9, tp = 1:1000)
save(simplex_output_speed, file = "simplex_output_speed_E_9_tp_1-1000.Rda")
simplex_output_rpm <- simplex(ts_rpm, lib_rpm, pred_rpm, E = 9, tp = 1:1000)
save(simplex_output_rpm, file = "simplex_output_rpm_E_9_tp_1-1000.Rda")
simplex_output_brake <- simplex(ts_brake, lib_brake, pred_brake, E = 9, tp = 1:1000)
save(simplex_output_brake, file = "simplex_output_brake_E_9_tp_1-1000.Rda")
# If instead you're loading previous calculations....
load(file = "simplex_output_speed_E_9_tp_1-1000.Rda")
load(file = "simplex_output_rpm_E_9_tp_1-1000.Rda")
load(file = "simplex_output_brake_E_9_tp_1-1000.Rda")
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "simplex_forecasting_one_row.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,0,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(simplex_output_speed$tp, simplex_output_speed$rho, type = "l", axes = FALSE, main="Vehicle Speed")
axis(side = 1, labels = TRUE)
axis(side = 2, labels = TRUE)
plot(simplex_output_rpm$tp, simplex_output_rpm$rho, type = "l", axes = FALSE, main="Engine RPM")
axis(side = 1, labels = TRUE)
axis(side = 2, labels = TRUE)
plot(simplex_output_brake$tp, simplex_output_brake$rho, type = "l", axes = FALSE, main="Brake Pressure")
axis(side = 1, labels = TRUE)
axis(side = 2, labels = TRUE)
title(xlab = "Time to Prediction (tp)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
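To put a number on how quickly forecast skill decays with the prediction horizon, one option (a sketch, assuming the tp = 1:1000 simplex outputs above are loaded; the 0.5 threshold is arbitrary) is the first tp at which rho falls below a threshold:
decay_tp <- function(out, threshold = 0.5) {
  idx <- which(out$rho < threshold)
  if (length(idx) == 0) NA else min(out$tp[idx])
}
print(c(speed = decay_tp(simplex_output_speed), rpm = decay_tp(simplex_output_rpm), brake = decay_tp(simplex_output_brake)))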
===============================================================================================================================================================================
2. Distinguish between red noise and nonlinear deterministic behavior using S-maps
===============================================================================================================================================================================
smap_output_speed <- s_map(ts_speed, lib_speed, pred_speed, E = 9, silent = TRUE)
smap_output_rpm <- s_map(ts_rpm, lib_rpm, pred_rpm, E = 9, silent = TRUE)
smap_output_brake <- s_map(ts_brake, lib_brake, pred_brake, E = 9, silent = TRUE)
load(file = "smap_output_rpm.Rda")
load(file = "smap_output_speed.Rda")
load(file = "smap_output_brake.Rda")
smap_output_speed = smap_output[[1]]
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "smap_one_row.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,1,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(smap_output_speed$theta, smap_output_speed$rho, type = "l", axes = FALSE, main="Vehicle Speed", xlim=c(0,8), ylim = c(0.9999912,0.9999931))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.9999912,0.9999931), labels = c(0.999991,0.999993))
plot(smap_output_rpm$theta, smap_output_rpm$rho, type = "l", axes = FALSE, main="Engine RPM", xlim=c(0,8), ylim = c(0.9999842,0.9999857))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.9999842,0.9999857), labels = c(0.999984,0.999985))
plot(smap_output_brake$theta, smap_output_brake$rho, type = "l", axes = FALSE, main="Brake Pressure", xlim=c(0,8), ylim = c(0.99693,0.99696))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.99693,0.99696), labels = c(0.99693,0.99696))
title(xlab = "Nonlinear Tuning Parameter (theta)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# To do individual plots
plot(smap_output_speed$theta, smap_output_speed$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.9999912,0.9999931))
title(main = "S-Map Forecast Skill For Vehicle Speed")
plot(smap_output_brake$theta, smap_output_brake$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.99693,0.99696))
title(main = "S-Map Forecast Skill For Brake Pressure")
plot(smap_output_rpm$theta, smap_output_rpm$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.9999842,0.9999857))
title(main = "S-Map Forecast Skill For Engine Rotations Per Minute")
===============================================================================================================================================================================
3. Perform Convergent Cross Mapping Between Variables Using Optimal Embedding Dimension E
===============================================================================================================================================================================
# num_samples = 10, lib_sizes = seq(0, 15000, by = 3000) causes 10 random-library models at each library size (3000, 6000, ..., 15000; the size-0 entry is degenerate)
speed_xmap_rpm <- ccm(speed_brake_rpm, lib_column = "speed", target_column = "rpm", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(speed_xmap_rpm, file = "speed_xmap_rpm_v2.Rda")
speed_xmap_brake <- ccm(speed_brake_rpm, lib_column = "speed", target_column = "brake", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(speed_xmap_brake, file = "speed_xmap_brake_v2.Rda")
brake_xmap_rpm <- ccm(speed_brake_rpm, lib_column = "brake", target_column = "rpm", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(brake_xmap_rpm, file = "brake_xmap_rpm_v2.Rda")
brake_xmap_speed <- ccm(speed_brake_rpm, lib_column = "brake", target_column = "speed", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(brake_xmap_speed, file = "brake_xmap_speed_v2.Rda")
rpm_xmap_speed <- ccm(speed_brake_rpm, lib_column = "rpm", target_column = "speed", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(rpm_xmap_speed, file = "rpm_xmap_speed_v2.Rda")
rpm_xmap_brake <- ccm(speed_brake_rpm, lib_column = "rpm", target_column = "brake", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(rpm_xmap_brake, file = "rpm_xmap_brake_v2.Rda")
# X xmap Y: causal effect of (pred) Y onto (lib) X
load(file = "speed_xmap_rpm.Rda")
load(file = "speed_xmap_brake.Rda")
load(file = "brake_xmap_rpm.Rda")
load(file = "brake_xmap_speed.Rda")
load(file = "rpm_xmap_speed.Rda")
load(file = "rpm_xmap_brake.Rda")
speed_xmap_rpm_mean <- ccm_means(speed_xmap_rpm)
speed_xmap_brake_mean <- ccm_means(speed_xmap_brake)
brake_xmap_rpm_mean <- ccm_means(brake_xmap_rpm)
brake_xmap_speed_mean <- ccm_means(brake_xmap_speed)
rpm_xmap_speed_mean <- ccm_means(rpm_xmap_speed)
rpm_xmap_brake_mean <- ccm_means(rpm_xmap_brake)
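Before plotting, a quick convergence check on the cross-map results (a sketch using the *_mean data frames just computed): rho should increase from the smallest to the largest library size, and the stronger direction at the largest library suggests which causal influence dominates.
ccm_summary <- function(m) {
  m <- m[m$lib_size > 0 & is.finite(m$rho), ]  # ignore the degenerate size-0 library
  c(rho_smallest_lib = m$rho[which.min(m$lib_size)], rho_largest_lib = m$rho[which.max(m$lib_size)])
}
print(rbind(speed_xmap_rpm = ccm_summary(speed_xmap_rpm_mean), rpm_xmap_speed = ccm_summary(rpm_xmap_speed_mean), speed_xmap_brake = ccm_summary(speed_xmap_brake_mean), brake_xmap_speed = ccm_summary(brake_xmap_speed_mean)))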
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "ccm_one_row.pdf", width = 5.5, height = 2.0, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,1,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(speed_xmap_rpm_mean$lib_size, pmax(0, speed_xmap_rpm_mean$rho), type = "l", col = "red", axes = FALSE, main="Speed & RPM", xlim = c(5000, 15000), ylim = c(0.86, 0.95))
lines(rpm_xmap_speed_mean$lib_size, pmax(0, rpm_xmap_speed_mean$rho), col = "blue")
legend(x = "bottomright", legend = c("S xmap R", "R xmap S"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(5000, 10000, 15000), labels = c("5k", "10k", "15k"))
axis(side = 2, at = c(0.86, 0.89, 0.92, 0.95), labels = c(0.86, 0.89, 0.92, 0.95))
plot(brake_xmap_rpm_mean$lib_size, pmax(0, brake_xmap_rpm_mean$rho), type = "l", col = "red", axes = FALSE, main="Brake & RPM", xlim = c(5000, 15000), ylim = c(0.35, 0.49))
lines(rpm_xmap_brake_mean$lib_size, pmax(0, rpm_xmap_brake_mean$rho), col = "blue")
legend(x = "right", legend = c("B xmap R", "R xmap B"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(5000, 10000, 15000), labels = c("5k", "10k", "15k"))
axis(side = 2, at = c(0.35, 0.40, 0.45, 0.49), labels = c(0.35, 0.40, 0.45, 0.49))
plot(speed_xmap_brake_mean$lib_size, pmax(0, speed_xmap_brake_mean$rho), type = "l", col = "red", axes = FALSE, main="Speed & Brake", xlim = c(5000, 15000), ylim = c(0.6, 0.9))
lines(brake_xmap_speed_mean$lib_size, pmax(0, brake_xmap_speed_mean$rho), col = "blue")
legend(x = "right", legend = c("S xmap B", "B xmap S"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(5000, 10000, 15000), labels = c("5k", "10k", "15k"))
axis(side = 2, at = c(0.6, 0.7, 0.8, 0.9), labels = c(0.6, 0.7, 0.8, 0.9))
title(xlab = "Library Size", ylab = "Cross Map Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
plot(speed_xmap_rpm_mean$lib_size, pmax(0, speed_xmap_rpm_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.86, 0.95))
lines(rpm_xmap_speed_mean$lib_size, pmax(0, rpm_xmap_speed_mean$rho), col = "blue")
legend(x = "topleft", legend = c("Speed xmap Engine RPM", "Engine RPM xmap Speed"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Speed and Engine RPM")
plot(brake_xmap_rpm_mean$lib_size, pmax(0, brake_xmap_rpm_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.35, 0.48))
lines(rpm_xmap_brake_mean$lib_size, pmax(0, rpm_xmap_brake_mean$rho), col = "blue")
legend(x = "left", legend = c("Brake Pressure xmap Engine RPM", "Engine RPM xmap Brake Pressure"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Brake Pressure and Engine RPM")
plot(speed_xmap_brake_mean$lib_size, pmax(0, speed_xmap_brake_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.6, 0.9))
lines(brake_xmap_speed_mean$lib_size, pmax(0, brake_xmap_speed_mean$rho), col = "blue")
legend(x = "left", legend = c("Speed xmap Brake Pressure", "Brake Pressure xmap Speed"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Speed and Brake Pressure")

View File

@ -0,0 +1,7 @@
# Assumed setup (not shown in the original file): the variables to compare and an empty results matrix
vars <- c("speed", "brake", "rpm")
corr_matrix <- matrix(NA, nrow = length(vars), ncol = length(vars), dimnames = list(vars, vars))
for (ccm_from in vars) {
  for (ccm_to in vars[vars != ccm_from]) {
    # maximum absolute lagged cross-correlation between the two series, lags up to +/- 500
    cf_temp <- ccf(speed_brake_rpm[, ccm_from], speed_brake_rpm[, ccm_to], type = "correlation", lag.max = 500, plot = FALSE)$acf
    corr_matrix[ccm_from, ccm_to] <- max(abs(cf_temp))
  }
}
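A short usage sketch (not in the original file) for inspecting the resulting matrix of maximum lagged cross-correlations:
print(round(corr_matrix, 3))
# the variable pair with the strongest linear (lagged) relationship
which(corr_matrix == max(corr_matrix, na.rm = TRUE), arr.ind = TRUE)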

25
R/home/.Rhistory Normal file
View File

@ -0,0 +1,25 @@
library(rEDM)
setwd("C:/Users/bnolan/Google Drive/AFIT/PhD Research/Validation/R/home")
speed_brake_rpm_throttle <- read.csv("speed_brake_rpm_throttle_home.csv")
View(speed_brake_rpm_throttle)
View(speed_brake_rpm_throttle)
ts_speed <- speed_brake_rpm_throttle$speed
lib_speed <- c(1, 0.5*length(ts_speed))
pred_speed <- c(0.5*length(ts_speed)+1, length(ts_speed))
ts_rpm <- speed_brake_rpm_throttle$rpm
lib_rpm <- c(1, 0.5*length(ts_rpm))
pred_rpm <- c(0.5*length(ts_rpm)+1, length(ts_rpm))
ts_brake <- speed_brake_rpm_throttle$brake
lib_brake <- c(1, 0.5*length(ts_brake))
pred_brake <- c(0.5*length(ts_brake)+1, length(ts_brake))
ts_throttle <- speed_brake_rpm_throttle$throttle
lib_throttle <- c(1, 0.5*length(ts_throttle))
pred_throttle <- c(0.5*length(ts_throttle)+1, length(ts_throttle))
simplex_output_throttle <- simplex(ts_throttle, lib_throttle, pred_throttle, silent = TRUE)
simplex_output_speed <- simplex(ts_speed, lib_speed, pred_speed, silent = TRUE)
simplex_output_rpm <- simplex(ts_rpm, lib_rpm, pred_rpm, silent = TRUE)
simplex_output_brake <- simplex(ts_brake, lib_brake, pred_brake, silent = TRUE)
save(simplex_output_throttle, file = "simplex_output_throttle.Rda")
save(simplex_output_speed, file = "simplex_output_speed.Rda")
save(simplex_output_rpm, file = "simplex_output_rpm.Rda")
save(simplex_output_brake, file = "simplex_output_brake.Rda")

View File

@ -0,0 +1,266 @@
par() features
mar
A numerical vector of the form c(bottom, left, top, right) which gives the number of lines of margin to be specified on the four sides of the plot. The default is c(5, 4, 4, 2) + 0.1
oma
A vector of the form c(bottom, left, top, right) giving the size of the outer margins in lines of text.
mgp
The margin line (in mex units) for the axis title, axis labels and axis line. Note that mgp[1] affects title whereas mgp[2:3] affect axis. The default is c(3, 1, 0).
library(rEDM)
setwd("C:/Users/brent/Google Drive/AFIT/PhD Research/Validation/R/home")
# CDN: setwd("C:/Users/bnolan/Google Drive/AFIT/PhD Research/Validation/R/home")
speed_brake_rpm_throttle <- read.csv("speed_brake_rpm_throttle_home.csv")
===============================================================================================================================================================================
1. Perform Simplex Projection to determine best embedding dimension
===============================================================================================================================================================================
ts_speed <- speed_brake_rpm_throttle$speed
lib_speed <- c(1, 0.5*length(ts_speed))
pred_speed <- c(0.5*length(ts_speed)+1, length(ts_speed))
ts_rpm <- speed_brake_rpm_throttle$rpm
lib_rpm <- c(1, 0.5*length(ts_rpm))
pred_rpm <- c(0.5*length(ts_rpm)+1, length(ts_rpm))
ts_brake <- speed_brake_rpm_throttle$brake
lib_brake <- c(1, 0.5*length(ts_brake))
pred_brake <- c(0.5*length(ts_brake)+1, length(ts_brake))
ts_throttle <- speed_brake_rpm_throttle$throttle
lib_throttle <- c(1, 0.5*length(ts_throttle))
pred_throttle <- c(0.5*length(ts_throttle)+1, length(ts_throttle))
simplex_output_throttle <- simplex(ts_throttle, lib_throttle, pred_throttle, silent = TRUE)
simplex_output_speed <- simplex(ts_speed, lib_speed, pred_speed, silent = TRUE)
simplex_output_rpm <- simplex(ts_rpm, lib_rpm, pred_rpm, silent = TRUE)
simplex_output_brake <- simplex(ts_brake, lib_brake, pred_brake, silent = TRUE)
save(simplex_output_throttle, file = "simplex_output_throttle.Rda")
save(simplex_output_speed, file = "simplex_output_speed.Rda")
save(simplex_output_rpm, file = "simplex_output_rpm.Rda")
save(simplex_output_brake, file = "simplex_output_brake.Rda")
# If instead you're loading previous calculations....
load(file = "simplex_output_speed.Rda")
load(file = "simplex_output_rpm.Rda")
load(file = "simplex_output_brake.Rda")
load(file = "simplex_output_throttle.Rda")
# If you're running into std::bad_alloc memory issues, try using a smaller slice of the data (below: the first 10% as the library and the next 10% as the prediction set)....
ts_speed <- speed_brake_rpm_throttle$speed
lib_speed <- c(1, 0.1*length(ts_speed))
pred_speed <- c(0.1*length(ts_speed)+1, 0.2*length(ts_speed))
ts_rpm <- speed_brake_rpm_throttle$rpm
lib_rpm <- c(1, 0.1*length(ts_rpm))
pred_rpm <- c(0.1*length(ts_rpm)+1, 0.2*length(ts_rpm))
ts_brake <- speed_brake_rpm_throttle$brake
lib_brake <- c(1, 0.1*length(ts_brake))
pred_brake <- c(0.1*length(ts_brake)+1, 0.2*length(ts_brake))
ts_throttle <- speed_brake_rpm_throttle$throttle
lib_throttle <- c(1, 0.1*length(ts_throttle))
pred_throttle <- c(0.1*length(ts_throttle)+1, 0.2*length(ts_throttle))
===============================================================================================================================================================================
1a. Plot and Save the Embedding Results
===============================================================================================================================================================================
par(mar = c(4, 4, 1, 1), mgp = c(2.5, 1, 0)) # set margins for plotting
# create a 1-row, 3-column plot matrix (pg. 260 of R-cookbook)
par(mfrow=c(1,3), cex=1.5)
plot(simplex_output_speed$E, simplex_output_speed$rho, type = "l", xlab = "Embedding Dimension (E)", ylab = expression(paste("Forecast Skill (", rho, ")")), main="Vehicle Speed")
plot(simplex_output_rpm$E, simplex_output_rpm$rho, type = "l", xlab = "Embedding Dimension (E)", main="Engine RPM")
plot(simplex_output_brake$E, simplex_output_brake$rho, type = "l", xlab = "Embedding Dimension (E)", main="Brake Pressure")
# IF YOU WANT TO PLOT ALL FOUR ON ONE LINE DIRECTLY TO A 5.5" x 2.5" PDF
# mgp affects (axis label offset, axis number offset, axis line offset). Resize the window in RStudio in order to resize the plot output when using the Export button.
pdf(file = "simplex_embedding_one_row_home.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,4), cex.axis=1, cex.lab=1, oma = c(3,3,0,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(simplex_output_speed$E, simplex_output_speed$rho, type = "l", axes = FALSE, main="Vehicle Speed", ylim = c(0.9995, 1.0), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.9995, 1.0), labels = c(0.9995, 1.0))
plot(simplex_output_rpm$E, simplex_output_rpm$rho, type = "l", axes = FALSE, main="Engine RPM", ylim = c(0.9995, 1.0), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.9995, 1.0), labels = c(0.9995, 1.0))
plot(simplex_output_brake$E, simplex_output_brake$rho, type = "l", axes = FALSE, main="Brake Pressure", ylim = c(0.95, 1), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.95, 1), labels = c(0.95, 1))
plot(simplex_output_throttle$E, simplex_output_throttle$rho, type = "l", axes = FALSE, main="Throttle Pressure", ylim = c(0.992, 1), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.992, 1), labels = c(0.992, 1))
title(xlab = "Embedding Dimension (E)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
===============================================================================================================================================================================
1b. Re-run Simplex Projection with the optimal E to determine whether forecast skill decays with the prediction horizon (rho dropping quickly as tp increases is a signature of deterministic chaos rather than noise)
===============================================================================================================================================================================
# Be sure to reload simplex_output_X using the E=9 computations before the next round of plots
simplex_output_speed <- simplex(ts_speed, lib_speed, pred_speed, E = 9, tp = 1:1000)
save(simplex_output_speed, file = "simplex_output_speed_E_9_tp_1-1000.Rda")
simplex_output_rpm <- simplex(ts_rpm, lib_rpm, pred_rpm, E = 9, tp = 1:1000)
save(simplex_output_rpm, file = "simplex_output_rpm_E_9_tp_1-1000.Rda")
simplex_output_brake <- simplex(ts_brake, lib_brake, pred_brake, E = 9, tp = 1:1000)
save(simplex_output_brake, file = "simplex_output_brake_E_9_tp_1-1000.Rda")
# If instead you're loading previous calculations....
load(file = "simplex_output_speed_E_9_tp_1-1000.Rda")
load(file = "simplex_output_rpm_E_9_tp_1-1000.Rda")
load(file = "simplex_output_brake_E_9_tp_1-1000.Rda")
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "simplex_forecasting_one_row_city.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,0,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(simplex_output_speed$tp, pmax(0, simplex_output_speed$rho), type = "l", axes = FALSE, main="Vehicle Speed", ylim = c(0, 1.0))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0, 0.5, 1.0), labels = c(0, 0.5, 1.0))
plot(simplex_output_rpm$tp, pmax(0, simplex_output_rpm$rho), type = "l", axes = FALSE, main="Engine RPM", ylim = c(0, 1.0))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0, 0.5, 1.0), labels = c(0, 0.5, 1.0))
plot(simplex_output_brake$tp, pmax(0, simplex_output_brake$rho), type = "l", axes = FALSE, main="Brake Pressure", ylim = c(0, 1.0))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0, 0.5, 1.0), labels = c(0, 0.5, 1.0))
title(xlab = "Time to Prediction (tp)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
===============================================================================================================================================================================
2. Distinguish between red noise and nonlinear deterministic behavior using S-maps
===============================================================================================================================================================================
smap_output_speed <- s_map(ts_speed, lib_speed, pred_speed, E = 9, silent = TRUE)
save(smap_output_speed, file = "smap_output_speed.Rda")
smap_output_rpm <- s_map(ts_rpm, lib_rpm, pred_rpm, E = 9, silent = TRUE)
save(smap_output_rpm, file = "smap_output_rpm.Rda")
smap_output_brake <- s_map(ts_brake, lib_brake, pred_brake, E = 9, silent = TRUE)
save(smap_output_brake, file = "smap_output_brake.Rda")
smap_output_throttle <- s_map(ts_throttle, lib_throttle, pred_throttle, E = 9, silent = TRUE)
save(smap_output_throttle, file = "smap_output_throttle.Rda")
load(file = "smap_output_rpm.Rda")
load(file = "smap_output_speed.Rda")
load(file = "smap_output_brake.Rda")
load(file = "smap_output_throttle.Rda")
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "smap_one_row_city.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,1,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(smap_output_speed$theta, smap_output_speed$rho, type = "l", axes = FALSE, main="Vehicle Speed", xlim=c(0,8), ylim = c(0.99997, 0.999975))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.99997,0.999975), labels = c(0.99997,0.999975))
plot(smap_output_rpm$theta, smap_output_rpm$rho, type = "l", axes = FALSE, main="Engine RPM", xlim=c(0,8), ylim = c(0.999931, 0.999937))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.999931,0.999937), labels = c(0.999931,0.999937))
plot(smap_output_brake$theta, smap_output_brake$rho, type = "l", axes = FALSE, main="Brake Pressure", xlim=c(0,8), ylim = c(0.986, 0.9885))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.986, 0.9885), labels = c(0.986, 0.9885))
title(xlab = "Nonlinear Tuning Parameter (theta)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# To do individual plots
plot(smap_output_speed$theta, smap_output_speed$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.9999912,0.9999931))
title(main = "S-Map Forecast Skill For Vehicle Speed")
plot(smap_output_brake$theta, smap_output_brake$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.99693,0.99696))
title(main = "S-Map Forecast Skill For Brake Pressure")
plot(smap_output_rpm$theta, smap_output_rpm$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.9999842,0.9999857))
title(main = "S-Map Forecast Skill For Engine Rotations Per Minute")
===============================================================================================================================================================================
3. Perform Convergent Cross Mapping Between Variables Using Optimal Embedding Dimension E
===============================================================================================================================================================================
# num_samples = 10 causes 10 random-library models at each non-zero library size in lib_sizes (the first call below uses lib_sizes up to 15000, the remaining calls up to 90000)
speed_xmap_rpm <- ccm(speed_brake_rpm_throttle, lib_column = "speed", target_column = "rpm", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(speed_xmap_rpm, file = "speed_xmap_rpm.Rda")
speed_xmap_brake <- ccm(speed_brake_rpm_throttle, lib_column = "speed", target_column = "brake", E = 9, lib_sizes = seq(0, 90000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(speed_xmap_brake, file = "speed_xmap_brake.Rda")
brake_xmap_rpm <- ccm(speed_brake_rpm_throttle, lib_column = "brake", target_column = "rpm", E = 9, lib_sizes = seq(0, 90000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(brake_xmap_rpm, file = "brake_xmap_rpm.Rda")
brake_xmap_speed <- ccm(speed_brake_rpm_throttle, lib_column = "brake", target_column = "speed", E = 9, lib_sizes = seq(0, 90000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(brake_xmap_speed, file = "brake_xmap_speed.Rda")
rpm_xmap_speed <- ccm(speed_brake_rpm_throttle, lib_column = "rpm", target_column = "speed", E = 9, lib_sizes = seq(0, 90000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(rpm_xmap_speed, file = "rpm_xmap_speed.Rda")
rpm_xmap_brake <- ccm(speed_brake_rpm_throttle, lib_column = "rpm", target_column = "brake", E = 9, lib_sizes = seq(0, 90000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(rpm_xmap_brake, file = "rpm_xmap_brake.Rda")
# X xmap Y: causal effect of (pred) Y onto (lib) X
load(file = "speed_xmap_rpm.Rda")
load(file = "speed_xmap_brake.Rda")
load(file = "brake_xmap_rpm.Rda")
load(file = "brake_xmap_speed.Rda")
load(file = "rpm_xmap_speed.Rda")
load(file = "rpm_xmap_brake.Rda")
speed_xmap_rpm_mean <- ccm_means(speed_xmap_rpm)
speed_xmap_brake_mean <- ccm_means(speed_xmap_brake)
brake_xmap_rpm_mean <- ccm_means(brake_xmap_rpm)
brake_xmap_speed_mean <- ccm_means(brake_xmap_speed)
rpm_xmap_speed_mean <- ccm_means(rpm_xmap_speed)
rpm_xmap_brake_mean <- ccm_means(rpm_xmap_brake)
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "ccm_one_row_city.pdf", width = 5.5, height = 2.0, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,1,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(speed_xmap_rpm_mean$lib_size, pmax(0, speed_xmap_rpm_mean$rho), type = "l", col = "red", axes = FALSE, main="Speed & RPM", xlim = c(3000, 15000), ylim = c(.5, .8))
lines(rpm_xmap_speed_mean$lib_size, pmax(0, rpm_xmap_speed_mean$rho), col = "blue")
legend(x = "right", legend = c("S xmap R", "R xmap S"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(3000, 9000, 15000), labels = c("3k", "9k", "15k"))
axis(side = 2, labels = TRUE)
plot(brake_xmap_rpm_mean$lib_size, pmax(0, brake_xmap_rpm_mean$rho), type = "l", col = "red", axes = FALSE, main="Brake & RPM", xlim = c(3000, 15000), ylim = c(.1, .3))
lines(rpm_xmap_brake_mean$lib_size, pmax(0, rpm_xmap_brake_mean$rho), col = "blue")
legend(x = "topright", legend = c("B xmap R", "R xmap B"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(3000, 9000, 15000), labels = c("3k", "9k", "15k"))
axis(side = 2, labels = TRUE)
plot(speed_xmap_brake_mean$lib_size, pmax(0, speed_xmap_brake_mean$rho), type = "l", col = "red", axes = FALSE, main="Speed & Brake", xlim = c(3000, 15000), ylim = c(.45, .6))
lines(brake_xmap_speed_mean$lib_size, pmax(0, brake_xmap_speed_mean$rho), col = "blue")
legend(x = "topright", legend = c("S xmap B", "B xmap S"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(3000, 9000, 15000), labels = c("3k", "9k", "15k"))
axis(side = 2, labels = TRUE)
title(xlab = "Library Size", ylab = "Cross Map Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
plot(speed_xmap_rpm_mean$lib_size, pmax(0, speed_xmap_rpm_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.86, 0.95))
lines(rpm_xmap_speed_mean$lib_size, pmax(0, rpm_xmap_speed_mean$rho), col = "blue")
legend(x = "topleft", legend = c("Speed xmap Engine RPM", "Engine RPM xmap Speed"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Speed and Engine RPM")
plot(brake_xmap_rpm_mean$lib_size, pmax(0, brake_xmap_rpm_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.35, 0.48))
lines(rpm_xmap_brake_mean$lib_size, pmax(0, rpm_xmap_brake_mean$rho), col = "blue")
legend(x = "left", legend = c("Brake Pressure xmap Engine RPM", "Engine RPM xmap Brake Pressure"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Brake Pressure and Engine RPM")
plot(speed_xmap_brake_mean$lib_size, pmax(0, speed_xmap_brake_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.6, 0.9))
lines(brake_xmap_speed_mean$lib_size, pmax(0, brake_xmap_speed_mean$rho), col = "blue")
legend(x = "left", legend = c("Speed xmap Brake Pressure", "Brake Pressure xmap Speed"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Speed and Brake Pressure")

12
R/load_xmap_Rda.R Normal file
View File

@ -0,0 +1,12 @@
load(file = "rpm_xmap_speed.Rda")
load(file = "rpm_xmap_brake.Rda")
load(file = "brake_xmap_speed.Rda")
load(file = "brake_xmap_rpm.Rda")
load(file = "speed_xmap_rpm.Rda")
load(file = "speed_xmap_brake.Rda")
ccm_out_1 <- ccm_means(speed_xmap_rpm)
ccm_out_2 <- ccm_means(speed_xmap_brake)
ccm_out_3 <- ccm_means(brake_xmap_rpm)
ccm_out_4 <- ccm_means(brake_xmap_speed)
ccm_out_5 <- ccm_means(rpm_xmap_speed)
ccm_out_6 <- ccm_means(rpm_xmap_brake)

1165
R/plant_data.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,31 @@
plot.new()
x <- ccm_out_2$lib_size
y_rpm_xmap_brake <- ccm_out_6$rho
y_speed_xmap_brake <- ccm_out_2$rho
plot(5,
5,
xlim=c(5000, 15000),
xlab = "Library Size",
ylim = c(.4,.9),
ylab = "Forcast Skill (rho)")
lines(x, y_rpm_xmap_brake, col="red", lwd=2)
lines(x, y_speed_xmap_brake, col="blue", lwd=2)
# Add a title
title("Causal Relationships Onto Brake Pressure")
# Add legend to plot
legend("topright",
inset=.2,
cex = 1,
#title="Legend",
c("RPM Xmap Brake","Speed Xmap Brake"),
horiz=FALSE,
lty=c(1,1),
lwd=c(2,2),
col=c("red","blue"),
bg="grey96")

31
R/plot_causal_onto_rpm.R Normal file
View File

@ -0,0 +1,31 @@
plot.new()
x <- ccm_out_3$lib_size
y_brake_xmap_rpm <- ccm_out_3$rho
y_speed_xmap_rpm <- ccm_out_1$rho
plot(5,
5,
xlim=c(5000, 15000),
xlab = "Library Size",
ylim = c(.3,1),
ylab = "Forcast Skill (rho)")
lines(x, y_brake_xmap_rpm, col="red", lwd=2)
lines(x, y_speed_xmap_rpm, col="blue", lwd=2)
# Add a title
title("Causal Relationships Onto Engine RPM")
# Add legend to plot
legend("topright",
inset=.2,
cex = 1,
#title="Legend",
c("Brake Xmap RPM","Speed Xmap RPM"),
horiz=FALSE,
lty=c(1,1),
lwd=c(2,2),
col=c("red","blue"),
bg="grey96")

View File

@ -0,0 +1,31 @@
plot.new()
x <- ccm_out_4$lib_size
y_brake_xmap_speed <- ccm_out_4$rho
y_rpm_xmap_speed <- ccm_out_5$rho
plot(5,
5,
xlim=c(5000, 15000),
xlab = "Library Size",
ylim = c(.6,.9),
ylab = "Forcast Skill (rho)")
lines(x, y_brake_xmap_speed, col="red", lwd=2)
lines(x, y_rpm_xmap_speed, col="blue", lwd=2)
# Add a title
title("Causal Relationships Onto Speed")
# Add legend to plot
legend("topright",
inset=.1,
cex = 1,
#title="Legend",
c("Brake Xmap Speed","RPM Xmap Speed"),
horiz=FALSE,
lty=c(1,1),
lwd=c(2,2),
col=c("red","blue"),
bg="grey96")

17842
R/rpm_data.csv Normal file

File diff suppressed because it is too large Load Diff

BIN
R/rpm_xmap_brake Normal file

Binary file not shown.

BIN
R/rpm_xmap_brake.Rda Normal file

Binary file not shown.

BIN
R/rpm_xmap_brake_v2.Rda Normal file

Binary file not shown.

BIN
R/rpm_xmap_speed.Rda Normal file

Binary file not shown.

BIN
R/rpm_xmap_speed_v2.Rda Normal file

Binary file not shown.

14
R/run_all_xmap.R Normal file
View File

@ -0,0 +1,14 @@
speed_xmap_rpm <- ccm(speed_brake_rpm, lib_column = "speed", target_column = "rpm", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(speed_xmap_rpm, file = "speed_xmap_rpm_v2.Rda")
speed_xmap_brake <- ccm(speed_brake_rpm, lib_column = "speed", target_column = "brake", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(speed_xmap_brake, file = "speed_xmap_brake_v2.Rda")
brake_xmap_rpm <- ccm(speed_brake_rpm, lib_column = "brake", target_column = "rpm", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(brake_xmap_rpm, file = "brake_xmap_rpm_v2.Rda")
brake_xmap_speed <- ccm(speed_brake_rpm, lib_column = "brake", target_column = "speed", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(brake_xmap_speed, file = "brake_xmap_speed_v2.Rda")
rpm_xmap_speed <- ccm(speed_brake_rpm, lib_column = "rpm", target_column = "speed", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(rpm_xmap_speed, file = "rpm_xmap_speed_v2.Rda")
rpm_xmap_brake <- ccm(speed_brake_rpm, lib_column = "rpm", target_column = "brake", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(rpm_xmap_brake, file = "rpm_xmap_brake_v2.Rda")

BIN
R/simplex_output_brake.Rda Normal file

Binary file not shown.

BIN
R/simplex_output_brake.pdf Normal file

Binary file not shown.

BIN
R/simplex_output_rpm.Rda Normal file

Binary file not shown.

BIN
R/simplex_output_rpm.pdf Normal file

Binary file not shown.

Some files were not shown because too many files have changed in this diff