Initial source code commit

This commit is contained in:
Brent 2019-03-08 15:17:12 -05:00
parent 69ea19f719
commit 94bf187ecd
124 changed files with 276183 additions and 0 deletions

BIN
AFIT-END-DS-18-D-003.pdf Normal file

Binary file not shown.

Pipeline/ArbID.py Normal file

@@ -0,0 +1,89 @@
from typing import Callable, List
from pandas import DataFrame
from numpy import float64, logical_xor, mean, ndarray, sqrt, std, sum, uint8, zeros
from PipelineTimer import PipelineTimer
# noinspection PyArgumentList
class ArbID:
def __init__(self, arb_id: int):
self.id: int = arb_id
# These features are set by PreProcessing.py's generate_arb_id_dictionary
self.dlc: int = 0
self.original_data: DataFrame = None
# These features are set in generate_binary_matrix_and_tang called by generate_arb_id_dictionary
self.boolean_matrix: ndarray = None
self.tang: ndarray = None
self.static: bool = True
# These features are set in analyze_transmission_frequency called by generate_arb_id_dictionary
self.ci_sensitivity: float = 0.0
self.freq_mean: float = 0.0
self.freq_std: float = 0.0
self.freq_ci: tuple = None
self.mean_to_ci_ratio: float = 0.0
self.synchronous: bool = False
# These features are set by LexicalAnalysis.py's get_composition
self.tokenization: List[tuple] = []
self.padding: List[int] = []
def generate_binary_matrix_and_tang(self, a_timer: PipelineTimer, normalize_strategy: Callable):
a_timer.start_nested_function_time()
self.boolean_matrix = zeros((self.original_data.__len__(), self.dlc * 8), dtype=uint8)
for i, row in enumerate(self.original_data.itertuples()):
for j, cell in enumerate(row[1:]):
# Skip cells that were already 0
if cell > 0:
# i is the row in the boolean_matrix
# j*8 is the left hand bit for this byte in the payload
# j*8 + 8 is the right hand bit for this byte in the payload
# e.g. byte index 1 starts at bits 1*8 = 8 to 1*8+8 = 16; [8:16]
# likewise, byte index 7 starts at bits 7*8 = 56 to 7*8+8 = 64
# Numpy indexing is non-inclusive of the upper bound. So [0:8] is the first 8 elements
bin_string = format(cell, '08b')
self.boolean_matrix[i, j * 8:j * 8 + 8] = [x == '1' for x in bin_string]
a_timer.set_hex_to_bool_matrix()
if self.boolean_matrix.shape[0] > 1:
a_timer.start_nested_function_time()
transition_matrix = logical_xor(self.boolean_matrix[:-1, ], self.boolean_matrix[1:, ])
self.tang = sum(transition_matrix, axis=0, dtype=float64)
# Ensure there are no divide-by-zero issues
if max(self.tang) > 0:
normalize_strategy(self.tang, axis=0, copy=False)
self.static = False
a_timer.set_bool_matrix_to_tang()
def analyze_transmission_frequency(self,
time_convert: int = 1000,
ci_accuracy: float = 1.645,
synchronous_threshold: float = 0.1):
# Don't bother doing this if there isn't enough data. You would need to handle a bunch of edge cases anyways.
if self.original_data.__len__() < 4:
return
self.ci_sensitivity = ci_accuracy
# time_convert = 1000 is intended to convert seconds to milliseconds.
freq_intervals = self.original_data.index[1:] - self.original_data.index[:-1]
self.freq_mean = mean(freq_intervals) * time_convert
self.freq_std = std(freq_intervals, ddof=1)*time_convert
# Assumes distribution of freq_intervals is gaussian normal.
mean_offset = ci_accuracy * self.freq_std / sqrt(len(freq_intervals))
self.freq_ci = (self.freq_mean - mean_offset, self.freq_mean + mean_offset)
# mean_to_ci_ratio is the ratio of the CI range to the mean value. This provides a heuristic to determine
# if this Arb ID can be called 'synchronous' given the time scale of its average transmission frequency.
# For example, imagine an Arb ID transmitting with a mean frequency of exactly one second. Its 90% Confidence
# Interval spans 50 milliseconds about that ideal mean. It's reasonable to assume this Arb ID was engineered to
# be a 'synchronous' signal. However, a different Arb ID with a mean frequency of 40 milliseconds and the same
# confidence interval could not be reasonably assumed to have been engineered to be 'synchronous'. This other
# Arb ID was engineered to be 'high frequency' compared to the first example, but there isn't evidence that it's
# intended to be a high frequency 'synchronous' process that adheres to a certain clock frequency.
# This inference relies upon the assumption that the OEM didn't engineer the bus to be a train wreck with some
# IDs losing an arbitrarily large percentage of their arbitration phases.
self.mean_to_ci_ratio = 2*mean_offset/self.freq_mean
if self.mean_to_ci_ratio <= synchronous_threshold:
self.synchronous = True

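A minimal worked example of the mean_to_ci_ratio heuristic described in analyze_transmission_frequency; the interval statistics below are invented for illustration and are not taken from any capture.

# Hypothetical numbers: ~1 second mean period, ~48.6 ms sample standard deviation,
# 10 observed intervals, 90% confidence (z = 1.645).
from numpy import sqrt

freq_mean = 1000.0                                        # milliseconds
freq_std = 48.6                                           # milliseconds
n_intervals = 10
ci_accuracy = 1.645
mean_offset = ci_accuracy * freq_std / sqrt(n_intervals)  # ~25.3 ms half-width
mean_to_ci_ratio = 2 * mean_offset / freq_mean            # ~0.05
print(mean_to_ci_ratio <= 0.1)                            # True -> flagged 'synchronous'
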
Pipeline/J1979.py Normal file

@@ -0,0 +1,97 @@
from pandas import DataFrame, Series
from numpy import int8, float16, uint8, uint16
class J1979:
def __init__(self, pid: int, original_data: DataFrame):
self.pid: int = pid
self.title: str = ""
self.data: Series = self.process_response_data(original_data)
print("Found " + str(self.data.shape[0]) + " responses for J1979 PID " + str(hex(self.pid)) + ":", self.title)
def process_response_data(self, original_data) -> Series:
# ISO-TP formatted Universal Diagnostic Service (UDS) requests that were sent by the CAN collection device
# during sampling. Request made using Arb ID 0x7DF with DLC of 8. Response should use Arb ID 0x7E8 (0x7DF + 0x8).
# DataFrame Columns: b0 b1 b2 b3 ... b7
# -- -- -- -- --
# PID 0x0C (12 dec) (Engine RPM): 02 01 0c 00 ... 00
# PID 0x0D (13 dec) (Vehicle Speed): 02 01 0d 00 ... 00
# PID 0x11 (17 dec) (Throttle Pos.): 02 01 11 00 ... 00
# PID 0x61 (97 dec) (Demand % Torque): 02 01 61 00 ... 00
# PID 0x62 (98 dec) (Engine % Torque): 02 01 62 00 ... 00
# PID 0x63 (99 dec) (Ref. Torque): 02 01 63 00 ... 00
# PID 0x8e (142 dec) (Friction Torque): 02 01 8e 00 ... 00
# Responses being managed here should follow the ISO-TP + UDS per-byte format AA BB CC DD .. DD
# BYTE: AA BB CC DD ... DD
# USE: response size (bytes) UDS mode + 0x40 UDS PID response data
# DF COLUMN: b0 b1 b2 b3 ... b7
# Remember that this response data is already converted to decimal. Thus, byte BB = 65 = 0x41 = 0x01 + 0x40.
# If BB isn't 0x41, check what the error code is. Some error codes are listed in the UDS chapter of the car
# hacker's handbook available at http://opengarages.org/handbook/ebook/.
if self.pid == 12:
self.title = 'Engine RPM'
# PID is 0x0C: Engine RPM. 2 bytes of data AA BB converted using 1/4 RPM per bit: (256*AA+BB)/4
# Min value: 0 Max value: 16,383.75 units: rpm
return Series(data=(256*original_data['b3']+original_data['b4'])/4,
index=original_data.index,
name=self.title,
dtype=float16)
elif self.pid == 13:
self.title = 'Speed km/h'
# PID is 0x0D: Vehicle Speed. 1 byte of data AA using 1km/h per bit: no conversion necessary
# Min value: 0 Max value: 255 (158.44965mph) units: km/h
return Series(data=original_data['b3'],
index=original_data.index,
name=self.title,
dtype=uint8)
elif self.pid == 17:
self.title = 'Throttle %'
# PID is 0x11: Throttle Position. 1 byte of data AA using 100/255 % per bit: AA * 100/255% throttle.
# Min value: 0 Max value: 100 units: %
return Series(data=100 * original_data['b3'] / 255,
index=original_data.index,
name=self.title,
dtype=uint8)
elif self.pid == 97:
self.title = 'Demand Torque %'
# PID is 0x61: Driver's demand engine - percent torque. 1 byte of data AA using 1%/bit with -125 offset
# AA - 125
# Min value: -125 Max value: 130 units: %
return Series(data=original_data['b3'] - 125,
index=original_data.index,
name=self.title,
dtype=int8)
elif self.pid == 98:
self.title = 'Actual Torque %'
# PID is 0x62: Actual engine - percent torque. 1 byte of data AA using 1%/bit with -125 offset
# AA - 125
# Min value: -125 Max value: 130 units: %
return Series(data=original_data['b3'] - 125,
index=original_data.index,
name=self.title,
dtype=int8)
elif self.pid == 99:
self.title = 'Reference Torque Nm'
# PID is 0x63: Engine reference torque. 2 bytes of data AA BB using 1 Nm/bit: 256*AA + BB Nm torque
# Min value: 0 Max value: 65,535 units: Nm
return Series(data=256*original_data['b3'] + original_data['b4'],
index=original_data.index,
name=self.title,
dtype=uint16)
elif self.pid == 142:
self.title = 'Engine Friction Torque %'
# PID is 0x8E: Engine Friction - Percent Torque. 1 byte of data AA using 1%/bit with -125 offset. AA - 125
# Min value: -125 Max value: 130 units: %
return Series(data=original_data['b3'] - 125,
index=original_data.index,
name=self.title,
dtype=int8)
else:
# Looks like you were requesting J1979 data with your sniffer that hasn't been implemented in this code.
# Time to do the leg work to expand this class accordingly then re-run the pipeline.
raise ValueError("Encountered J1979 PID " + str(hex(self.pid)) +
" in Pre-Processing that hasn't been programmed. Expand the J1979 class to handle all "
"J1979 requests made during data collection.")

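A small sketch of the PID 0x0C (Engine RPM) conversion handled by process_response_data above; the response bytes are made up for illustration.

# Hypothetical UDS response data bytes for PID 0x0C: b3 = 0x1A, b4 = 0xF8.
b3, b4 = 0x1A, 0xF8
rpm = (256 * b3 + b4) / 4   # (256*26 + 248) / 4 = 1726.0 rpm
print(rpm)
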
Pipeline/LexicalAnalysis.py Normal file

@@ -0,0 +1,177 @@
from numpy import float64, nditer, uint64, zeros
from pandas import Series
from os import path, remove
from pickle import load
from ArbID import ArbID
from Signal import Signal
from PipelineTimer import PipelineTimer
def tokenize_dictionary(a_timer: PipelineTimer,
d: dict,
force: bool = False,
include_padding: bool = False,
merge: bool = True,
max_distance: float = 0.1):
a_timer.start_function_time()
for k, arb_id in d.items():
if not arb_id.static:
if arb_id.padding and not force:
print("\nTokenization already completed and forcing is turned off. Skipping...")
return
a_timer.start_iteration_time()
get_composition(arb_id, include_padding, max_distance)
a_timer.set_tang_to_composition()
if merge:
a_timer.start_iteration_time()
merge_tokens(arb_id, max_distance)
a_timer.set_composition_merge()
a_timer.set_tokenization()
# This is a greedy algorithm to cluster bit positions in a series of CAN payloads suspected of being part of a
# continuous numerical time series.
def get_composition(arb_id: ArbID, include_padding=False, max_inversion_distance: float = 0.0):
tokens = []
start_index = 0
currently_clustering = False
big_endian = True
last_bit_position = 0
# Consider each element in the TANG. The TANG is an ndarray with index being bit position from the
# original CAN data. The cell value is the observed transition frequency for that bit position.
for i, bit_position in enumerate(nditer(arb_id.tang)):
# Is this a padding bit?
if bit_position <= 0.000001:
arb_id.padding.append(i)
# Are we clustering padding bits? If so, proceed to the normal clustering logic. Else, do the following.
if not include_padding:
if currently_clustering:
# This is padding, we're already clustering, and we're not clustering padding; save the token.
tokens.append((start_index, i - 1))
currently_clustering = False
start_index = i + 1
last_bit_position = bit_position
continue
# Are we still enlarging the current token?
if currently_clustering:
if bit_position >= last_bit_position and big_endian:
pass
elif bit_position <= last_bit_position and not big_endian:
pass
# Are we allowing inversions (max_inversion_distance > 0)? If so, check if this inversion is acceptable.
elif abs(bit_position-last_bit_position) <= max_inversion_distance:
pass
# Is this the second bit position we need to establish the endianness of the signal?
elif start_index == i - 1:
if bit_position >= last_bit_position:
big_endian = True
else:
big_endian = False
# This is an unacceptable transition frequency inversion, save the current token and start a new one
else:
tokens.append((start_index, i - 1))
start_index = i
# We aren't currently clustering and we intend to cluster this bit position
else:
currently_clustering = True
start_index = i
last_bit_position = bit_position
# We reached the last bit position while clustering. Add this final token.
if currently_clustering:
tokens.append((start_index, arb_id.tang.__len__() - 1))
arb_id.tokenization = tokens
def merge_tokens(arb_id: ArbID, max_distance):
# if arb_id.id == 292: # make this equal to the decimal value of an Arb ID in the data you want to see get merged
# verbose = True
# else:
verbose = False
if verbose:
print("\nENTERING MERGE PHASE OF ARB ID", arb_id.id)
print("STARTING TOKENS:", arb_id.tokenization)
if arb_id.static or arb_id.tokenization.__len__() < 2:
# Make sure there are multiple tokens to merge
pass
else:
# Editing data structures while iterating over them is a bad idea in Python
# Instead, let's keep track of tokens we want to delete using remove_list.
remove_list = []
last = None
for i, token in enumerate(arb_id.tokenization):
if verbose:
print("Last:", last, "\tCurrent:", token)
if last:
# Are these tokens adjacent?
if last[1] + 1 == token[0]:
if verbose:
print("\tAdjacent with distance of", abs(arb_id.tang[last[1]] - arb_id.tang[token[0]]))
# Is the transition frequency of the adjacent bit positions less than the max distance threshold?
if abs(arb_id.tang[last[1]] - arb_id.tang[token[0]]) <= max_distance:
remove_list.append(last)
token = (last[0], token[1])
arb_id.tokenization[i] = token
if verbose:
print("\t\tMerged into", token)
last = token
if remove_list:
for token in remove_list:
arb_id.tokenization.remove(token)
if verbose:
print("final tokenization", arb_id.tokenization)
# noinspection PyTypeChecker
def generate_signals(a_timer: PipelineTimer,
arb_id_dict: dict,
signal_pickle_filename: str,
normalize_strategy,
force=False):
if path.isfile(signal_pickle_filename):
if force:
# Remove any existing pickled Signal dictionary and create one.
remove(signal_pickle_filename)
else:
print("\nSignal generation already completed and forcing is turned off. Using pickled data...")
return load(open(signal_pickle_filename, "rb"))
a_timer.start_function_time()
signal_dict = {}
for k, arb_id in arb_id_dict.items():
if not arb_id.static:
for token in arb_id.tokenization:
a_timer.start_iteration_time()
signal = Signal(k, token[0], token[1])
# Convert the binary ndarray to a list of string representations of each row
temp1 = [''.join(str(x) for x in row) for row in arb_id.boolean_matrix[:, token[0]:token[1] + 1]]
temp2 = zeros((temp1.__len__(), 1), dtype=uint64)
# convert each string representation to int
for i, row in enumerate(temp1):
temp2[i] = int(row, 2)
# create an unsigned integer pandas.Series using the time index from this Arb ID's original data.
signal.time_series = Series(temp2[:, 0], index=arb_id.original_data.index, dtype=float64)
# Normalize the signal and update its meta-data
signal.normalize_and_set_metadata(normalize_strategy)
# add this signal to the signal dictionary which is keyed by Arbitration ID
if k in signal_dict:
signal_dict[k][(arb_id.id, signal.start_index, signal.stop_index)] = signal
else:
signal_dict[k] = {(arb_id.id, signal.start_index, signal.stop_index): signal}
a_timer.set_token_to_signal()
a_timer.set_signal_generation()
return signal_dict

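A usage sketch for get_composition on a hand-made TANG; the values are invented, whereas a real TANG comes from ArbID.generate_binary_matrix_and_tang.

from numpy import array, float64
from ArbID import ArbID
from LexicalAnalysis import get_composition

toy = ArbID(0x123)
toy.tang = array([0.1, 0.2, 0.9, 0.0, 0.0, 0.30, 0.32, 0.28], dtype=float64)
get_composition(toy, include_padding=False, max_inversion_distance=0.1)
print(toy.tokenization)   # expected [(0, 2), (5, 7)] with these toy values
print(toy.padding)        # [3, 4]; bits 3 and 4 never toggled
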
Pipeline/Main.py Normal file

@@ -0,0 +1,198 @@
from os import chdir, mkdir, path, remove
from pickle import dump
from sklearn.preprocessing import minmax_scale
from typing import Callable
from PreProcessor import PreProcessor
from LexicalAnalysis import tokenize_dictionary, generate_signals
from SemanticAnalysis import subset_selection, subset_correlation, greedy_signal_clustering, label_propagation, \
j1979_signal_labeling
from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster
from PipelineTimer import PipelineTimer
# File names for the on-disc data input and output.
# Input:
can_data_filename: str = 'drive_runway_afit.log'
# can_data_filename: str = 'loggerProgram0.log'
# Output:
output_folder: str = 'output'
pickle_arb_id_filename: str = 'pickleArbIDs.p'
pickle_j1979_filename: str = 'pickleJ1979.p'
pickle_signal_filename: str = 'pickleSignals.p'
pickle_subset_filename: str = 'pickleSubset.p'
csv_correlation_filename: str = 'subset_correlation_matrix.csv'
pickle_j1979_correlation: str = 'pickleJ1979_correlation.p'
pickle_clusters_filename: str = 'pickleClusters.p'
pickle_all_signal_filename: str = 'pickleAllSignalsDataFrame.p'
csv_all_signals_filename: str = 'complete_correlation_matrix.csv'
pickle_timer_filename: str = 'pickleTimer.p'
# Change out the normalization strategies as needed.
tang_normalize_strategy: Callable = minmax_scale
signal_normalize_strategy: Callable = minmax_scale
# Turn on or off portions of the pipeline and output methods using these flags.
force_pre_processing: bool = False
force_j1979_plotting: bool = False
force_lexical_analysis: bool = False
force_arb_id_plotting: bool = True
force_semantic_analysis: bool = False
force_signal_labeling: bool = False
use_j1979_tags_in_plots: bool = True
force_cluster_plotting: bool = False
dump_to_pickle: bool = True
# Parameters and threshold used for Arb ID transmission frequency analysis during Pre-processing.
time_conversion = 1000 # convert seconds to milliseconds
z_lookup = {.8: 1.28, .9: 1.645, .95: 1.96, .98: 2.33, .99: 2.58}
freq_analysis_accuracy = z_lookup[0.9]
freq_synchronous_threshold = 0.1
# Threshold parameters used during lexical analysis.
tokenization_bit_distance: float = 0.2
tokenize_padding: bool = True
# Threshold parameters used during semantic analysis
subset_selection_size: float = 0.25
fuzzy_labeling: bool = True
min_correlation_threshold: float = 0.85
# A timer class to record timings throughout the pipeline.
a_timer = PipelineTimer(verbose=True)
# DATA IMPORT AND PRE-PROCESSING #
pre_processor = PreProcessor(can_data_filename, pickle_arb_id_filename, pickle_j1979_filename)
id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
tang_normalize_strategy,
time_conversion,
freq_analysis_accuracy,
freq_synchronous_threshold,
force_pre_processing)
if j1979_dictionary:
plot_j1979(a_timer, j1979_dictionary, force_j1979_plotting)
# LEXICAL ANALYSIS #
print("\n\t\t\t##### BEGINNING LEXICAL ANALYSIS #####")
tokenize_dictionary(a_timer,
id_dictionary,
force_lexical_analysis,
include_padding=tokenize_padding,
merge=True,
max_distance=tokenization_bit_distance)
signal_dictionary = generate_signals(a_timer,
id_dictionary,
pickle_signal_filename,
signal_normalize_strategy,
force_lexical_analysis)
plot_signals_by_arb_id(a_timer, id_dictionary, signal_dictionary, force_arb_id_plotting)
# SEMANTIC ANALYSIS #
print("\n\t\t\t##### BEGINNING SEMANTIC ANALYSIS #####")
subset_df = subset_selection(a_timer,
signal_dictionary,
pickle_subset_filename,
force_semantic_analysis,
subset_size=subset_selection_size)
corr_matrix_subset = subset_correlation(subset_df, csv_correlation_filename, force_semantic_analysis)
cluster_dict = greedy_signal_clustering(corr_matrix_subset,
correlation_threshold=min_correlation_threshold,
fuzzy_labeling=fuzzy_labeling)
df_full, corr_matrix_full, cluster_dict = label_propagation(a_timer,
pickle_clusters_filename=pickle_clusters_filename,
pickle_all_signals_df_filename=pickle_all_signal_filename,
csv_signals_correlation_filename=csv_all_signals_filename,
signal_dict=signal_dictionary,
cluster_dict=cluster_dict,
correlation_threshold=min_correlation_threshold,
force=force_semantic_analysis)
signal_dictionary, j1979_correlations = j1979_signal_labeling(a_timer=a_timer,
j1979_corr_filename=pickle_j1979_correlation,
df_signals=df_full,
j1979_dict=j1979_dictionary,
signal_dict=signal_dictionary,
correlation_threshold=min_correlation_threshold,
force=force_signal_labeling)
plot_signals_by_cluster(a_timer, cluster_dict, signal_dictionary, use_j1979_tags_in_plots, force_cluster_plotting)
# DATA STORAGE #
if dump_to_pickle:
if force_pre_processing:
if path.isfile(pickle_arb_id_filename):
remove(pickle_arb_id_filename)
if path.isfile(pickle_j1979_filename):
remove(pickle_j1979_filename)
if force_lexical_analysis or force_signal_labeling:
if path.isfile(pickle_signal_filename):
remove(pickle_signal_filename)
if force_semantic_analysis:
if path.isfile(pickle_subset_filename):
remove(pickle_subset_filename)
if path.isfile(csv_correlation_filename):
remove(csv_correlation_filename)
if path.isfile(pickle_j1979_correlation):
remove(pickle_j1979_correlation)
if path.isfile(pickle_clusters_filename):
remove(pickle_clusters_filename)
if path.isfile(pickle_all_signal_filename):
remove(pickle_all_signal_filename)
if path.isfile(csv_all_signals_filename):
remove(csv_all_signals_filename)
timer_flag = 0
if not path.exists(output_folder):
mkdir(output_folder)
chdir(output_folder)
if not path.isfile(pickle_arb_id_filename):
timer_flag += 1
print("\nDumping arb ID dictionary to " + pickle_arb_id_filename)
dump(id_dictionary, open(pickle_arb_id_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_j1979_filename):
timer_flag += 1
print("\nDumping J1979 dictionary to " + pickle_j1979_filename)
dump(j1979_dictionary, open(pickle_j1979_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_signal_filename):
timer_flag += 1
print("\nDumping signal dictionary to " + pickle_signal_filename)
dump(signal_dictionary, open(pickle_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_subset_filename):
timer_flag += 1
print("\nDumping signal subset list to " + pickle_subset_filename)
dump(subset_df, open(pickle_subset_filename, "wb"))
print("\tComplete...")
if not path.isfile(csv_correlation_filename):
timer_flag += 1
print("\nDumping subset correlation matrix to " + csv_correlation_filename)
corr_matrix_subset.to_csv(csv_correlation_filename)
print("\tComplete...")
if not path.isfile(pickle_j1979_correlation):
timer_flag += 1
print("\nDumping J1979 correlation DataFrame to " + pickle_j1979_correlation)
dump(j1979_correlations, open(pickle_j1979_correlation, "wb"))
print("\tComplete...")
if not path.isfile(pickle_clusters_filename):
timer_flag += 1
print("\nDumping cluster dictionary to " + pickle_clusters_filename)
dump(cluster_dict, open(pickle_clusters_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_all_signal_filename):
timer_flag += 1
print("\nDumping complete signals DataFrame to " + pickle_all_signal_filename)
dump(df_full, open(pickle_all_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(csv_all_signals_filename):
timer_flag += 1
print("\nDumping complete correlation matrix to " + csv_all_signals_filename)
corr_matrix_full.to_csv(csv_all_signals_filename)
print("\tComplete...")
if timer_flag == 9:
print("\nDumping pipeline timer to " + pickle_timer_filename)
dump(a_timer, open(pickle_timer_filename, "wb"))
print("\tComplete...")
chdir("..")

Pipeline/PipelineTimer.py Normal file

@@ -0,0 +1,149 @@
from time import time
from typing import List
class PipelineTimer:
def __init__(self, verbose: bool = True):
self.verbose: bool = verbose
self.function_time: float = 0.0
self.nested_function_time: float = 0.0
self.iteration_time: float = 0.0
self.nested_iteration_time: float = 0.0
# Pre-Processing Timings
self.can_csv_to_df: float = 0.0
self.raw_df_to_arb_id_dict: float = 0.0
self.arb_id_creation: List[float] = []
self.j1979_creation: float = 0.0
self.hex_to_bool_matrix: List[float] = []
self.bool_matrix_to_tang: List[float] = []
self.plot_save_j1979_dict: float = 0.0
self.plot_save_j1979_pid: List[float] = []
# Lexical Analysis Timings
self.tokenization: float = 0.0
self.tang_to_composition: List[float] = []
self.composition_merge: List[float] = []
self.signal_generation: float = 0.0
self.token_to_signal: List[float] = []
self.plot_save_arb_id_dict: float = 0.0
self.plot_save_arb_id: List[float] = []
# Semantic Analysis Timings
self.subset_selection: float = 0.0
self.plot_save_cluster_dict: float = 0.0
self.label_propagation: float = 0.0
self.plot_save_cluster: List[float] = []
def start_function_time(self):
self.function_time = time()
def start_nested_function_time(self):
self.nested_function_time = time()
def start_iteration_time(self):
self.iteration_time = time()
def start_nested_iteration_time(self):
self.nested_iteration_time = time()
# Pre-Processing Timings #
# Called in PreProcessor.import_csv
def set_can_csv_to_df(self):
self.can_csv_to_df = time() - self.function_time
if self.verbose:
print("\n" + str(self.can_csv_to_df) + " seconds to import and format raw data into a DataFrame")
# Called in PreProcessor.generate_arb_id_dictionary
def set_raw_df_to_arb_id_dict(self):
self.raw_df_to_arb_id_dict = time() - self.function_time
if self.verbose:
print("\n" + str(self.raw_df_to_arb_id_dict) +
" seconds to produce arbitration ID dictionary, boolean matrices, and TANGs")
# Called in the loop within PreProcessor.generate_arb_id_dictionary
def set_arb_id_creation(self):
self.arb_id_creation.append(time() - self.iteration_time)
def set_j1979_creation(self):
self.j1979_creation = time() - self.nested_function_time
if self.verbose:
print("\n" + str(self.j1979_creation) + " seconds to process J1979 response data into a dictionary")
# Called in ArbID.generate_binary_matrix_and_tang
def set_hex_to_bool_matrix(self):
self.hex_to_bool_matrix.append(time() - self.nested_function_time)
# Called in ArbID.generate_binary_matrix_and_tang
def set_bool_matrix_to_tang(self):
self.bool_matrix_to_tang.append(time() - self.nested_function_time)
# Called in the Plotter.py plot_j1979 function.
def set_plot_save_j1979_dict(self):
self.plot_save_j1979_dict = time() - self.function_time
if self.verbose:
print("\n" + str(self.plot_save_j1979_dict) + " seconds to plot and save the J1979 response data")
# Called in the loop within the Plotter.py plot_j1979 function.
def set_plot_save_j1979_pid(self):
self.plot_save_j1979_pid.append(time() - self.iteration_time)
# Lexical Analysis Timings #
# Called in the LexicalAnalysis.py tokenize_dictionary function.
def set_tokenization(self):
self.tokenization = time() - self.function_time
if self.verbose:
print("\n" + str(self.tokenization) + " seconds to tokenize the arbitration ID dictionary using TANGs")
# Called in the loop within the LexicalAnalysis.py tokenize_dictionary function.
def set_tang_to_composition(self):
self.tang_to_composition.append(time() - self.iteration_time)
# Called in the loop within the LexicalAnalysis.py tokenize_dictionary function.
def set_composition_merge(self):
self.composition_merge.append(time() - self.iteration_time)
# Called in the LexicalAnalysis.py generate_signals function.
def set_signal_generation(self):
self.signal_generation = time() - self.function_time
if self.verbose:
print("\n" + str(self.signal_generation) +
" seconds to generate signals and their statistics using token compositions.")
# Called in the loop within the LexicalAnalysis.py generate_signals function.
def set_token_to_signal(self):
self.token_to_signal.append(time() - self.iteration_time)
# Called in the Plotter.py plot_signals_by_arb_id function.
def set_plot_save_arb_id_dict(self):
self.plot_save_arb_id_dict = time() - self.function_time
if self.verbose:
print("\n" + str(self.plot_save_arb_id_dict) + " seconds to plot and save the Signals and TANGs by Arb ID")
# Called in the loop within the Plotter.py plot_signals_by_arb_id function.
def set_plot_save_arb_id(self):
self.plot_save_arb_id.append(time() - self.iteration_time)
# Semantic Analysis Timings #
# Called in the SemanticAnalysis.py subset_correlation function.
def set_subset_selection(self):
self.subset_selection = time() - self.function_time
# Called in the SemanticAnalysis.py label_propagation function.
def set_label_propagation(self):
self.label_propagation = time() - self.function_time
if self.verbose:
print("\n" + str(self.label_propagation) + " seconds to perform label propagation.")
# Called in the Plotter.py plot_signals_by_cluster function.
def set_plot_save_cluster_dict(self):
self.plot_save_cluster_dict = time() - self.function_time
if self.verbose:
print("\n" + str(self.plot_save_cluster_dict) + " seconds to plot and save the clusters.")
# Called in the loop within the Plotter.py plot_signals_by_cluster function.
def set_plot_save_cluster(self):
self.plot_save_cluster.append(time() - self.iteration_time)

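A brief sketch of the intended start/set pairing for these timers, mirroring how PreProcessor.import_csv uses them; the sleep call simply stands in for the timed work.

from time import sleep
from PipelineTimer import PipelineTimer

a_timer = PipelineTimer(verbose=True)
a_timer.start_function_time()
sleep(0.1)                   # stand-in for the timed work, e.g. reading the CSV
a_timer.set_can_csv_to_df()  # records the elapsed seconds and prints them when verbose
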
Pipeline/Plotter.py Normal file

@@ -0,0 +1,221 @@
import matplotlib.pyplot as plt
from matplotlib.pyplot import savefig
from numpy import where, isin
from os import chdir, mkdir, path
from shutil import rmtree
from PipelineTimer import PipelineTimer
figure_format: str = "png" # png works well for quickly previewing results and draft results write ups.
# figure_format: str = "eps" # Journals/Conferences generally prefer EPS file format for camera-ready copies.
figure_dpi: int = 300
figure_transp: bool = False
arb_id_folder: str = 'figures'
cluster_folder: str = 'clusters'
j1979_folder: str = 'j1979'
def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, force: bool=False):
if path.exists(arb_id_folder):
if force:
rmtree(arb_id_folder)
else:
print("\nArbID plotting appears to have already been done and forcing is turned off. Skipping...")
return
a_timer.start_function_time()
for k_id, signals in signal_dict.items():
arb_id = arb_id_dict[k_id]
if not arb_id.static:
print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ")")
a_timer.start_iteration_time()
signals_to_plot = []
# Don't plot the static signals
for k_signal, signal in signals.items():
if not signal.static:
signals_to_plot.append(signal)
# One row per signal plus one for the TANG
fig, axes = plt.subplots(nrows=1 + len(signals_to_plot), ncols=1)
plt.suptitle("Time Series and TANG for Arbitration ID " + hex(k_id),
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + len(signals_to_plot) + 1) * 1.3)
# The min() statement provides whitespace for the title depending on the number of subplots.
size_adjust = len(signals_to_plot) / 100
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
for i, signal in enumerate(signals_to_plot):
ax = axes[i]
ax.set_title(signal.plot_title,
style='italic',
size='medium')
ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
ax.plot(signal.time_series, color='black')
# Add a 25% opacity dashed black line to the entropy gradient plot at one boundary of each sub-flow
axes[-1].axvline(x=signal.start_index, alpha=0.25, c='black', linestyle='dashed')
# Plot the entropy gradient at the bottom of the overall output
ax = axes[-1]
ax.set_title("Min-Max Normalized Transition Aggregation N-Gram (TANG)",
style='italic',
size='medium')
tang_bit_width = arb_id.tang.shape[0]
ax.set_xlim([-0.01 * tang_bit_width, 1.005 * tang_bit_width])
y = arb_id.tang[:]
# Differentiate bit positions with non-zero and zero entropy using black circles and grey triangles respectively.
ix = isin(y, 0)
pad_bit = where(ix)
non_pad_bit = where(~ix)
ax.scatter(non_pad_bit, y[non_pad_bit], color='black', marker='o', s=10)
ax.scatter(pad_bit, y[pad_bit], color='grey', marker='^', s=10)
if not path.exists(arb_id_folder):
mkdir(arb_id_folder)
chdir(arb_id_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig(hex(arb_id.id) + "." + figure_format,
bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
chdir("..")
plt.close(fig)
a_timer.set_plot_save_arb_id()
print("\tComplete...")
a_timer.set_plot_save_arb_id_dict()
def plot_signals_by_cluster(a_timer: PipelineTimer,
cluster_dict: dict,
signal_dict: dict,
use_j1979_tags: bool,
force: bool=False):
if path.exists(cluster_folder):
if force:
rmtree(cluster_folder)
else:
print("\nCluster plotting appears to have already been done and forcing is turned off. Skipping...")
return
a_timer.start_function_time()
print("\n")
for cluster_number, list_of_signals in cluster_dict.items():
print("Plotting cluster", cluster_number, "with " + str(len(list_of_signals)) + " signals.")
a_timer.start_iteration_time()
# Setup the plot
fig, axes = plt.subplots(nrows=len(list_of_signals), ncols=1, squeeze=False)
plt.suptitle("Cluster Number " + str(cluster_number),
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + len(list_of_signals)+1) * 1.3)
size_adjust = len(list_of_signals) / 100
# The min() statement provides whitespace for the suptitle depending on the number of subplots.
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
# Plot the time series of each signal in the cluster
for i, signal_key in enumerate(list_of_signals):
signal = signal_dict[signal_key[0]][signal_key]
ax = axes[i, 0]
if signal.j1979_title and use_j1979_tags:
this_title = signal.plot_title + " [" + signal.j1979_title + \
" (PCC:" + str(round(signal.j1979_pcc, 2)) + ")]"
else:
this_title = signal.plot_title
ax.set_title(this_title,
style='italic',
size='medium')
ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
ax.plot(signal.time_series, color='black')
if not path.exists(cluster_folder):
mkdir(cluster_folder)
chdir(cluster_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig("cluster_" + str(cluster_number) + "." + figure_format,
bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
chdir("..")
plt.close(fig)
a_timer.set_plot_save_cluster()
print("\tComplete...")
a_timer.set_plot_save_cluster_dict()
def plot_j1979(a_timer: PipelineTimer, j1979_dict: dict, force: bool=False):
if path.exists(j1979_folder):
if force:
rmtree(j1979_folder)
else:
print("\nJ1979 plotting appears to have already been done and forcing is turned off. Skipping...")
return
a_timer.start_function_time()
print("Plotting J1979 response data")
plot_length = len(j1979_dict.keys())
# Setup the plot
fig, axes = plt.subplots(nrows=plot_length, ncols=1, squeeze=False)
plt.suptitle("J1979 Data",
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + plot_length) * 1.3)
size_adjust = plot_length / 100
# The min() statement provides whitespace for the suptitle depending on the number of subplots.
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
# Plot the time series of each signal in the cluster
for i, (pid, data) in enumerate(j1979_dict.items()):
a_timer.start_iteration_time()
ax = axes[i, 0]
ax.set_title("PID " + str(hex(pid)) + ": " + data.title,
style='italic',
size='medium')
ax.set_xlim([data.data.first_valid_index(), data.data.last_valid_index()])
ax.plot(data.data, color='black')
a_timer.set_plot_save_j1979_pid()
if not path.exists(j1979_folder):
mkdir(j1979_folder)
chdir(j1979_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig("j1979." + figure_format,
bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
chdir("..")
plt.close(fig)
a_timer.set_plot_save_j1979_dict()
print("\tComplete...")

Pipeline/PreProcessor.py Normal file

@@ -0,0 +1,141 @@
from pandas import DataFrame, read_csv, Series
from numpy import int64
from os import path, remove
from pickle import load
from typing import Callable
from ArbID import ArbID
from J1979 import J1979
from PipelineTimer import PipelineTimer
class PreProcessor:
def __init__(self, data_filename: str, id_output_filename: str, j1979_output_filename: str):
self.data_filename: str = data_filename
self.id_output_filename: str = id_output_filename
self.j1979_output_filename: str = j1979_output_filename
self.data: DataFrame = None
self.import_time: float = 0.0
self.dictionary_time: float = 0.0
self.total_time: float = 0.0
def import_csv(self, a_timer: PipelineTimer, filename):
def hex2int(x):
if x == '':
return 0
return int(x, 16)
def fix_time(x):
try:
return float(str(x)[:-1])
except ValueError:
# There may have been a newline the capture device was trying to write when turned off.
return None
# Used by pd.read_csv to apply the functions to the respective column vectors in the .csv file
convert_dict = {'time': fix_time, 'id': hex2int, 'dlc': hex2int, 'b0': hex2int, 'b1': hex2int, 'b2': hex2int,
'b3': hex2int, 'b4': hex2int, 'b5': hex2int, 'b6': hex2int, 'b7': hex2int}
print("\nReading in " + self.data_filename + "...")
a_timer.start_function_time()
self.data = read_csv(filename,
header=None,
names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
skiprows=7,
delimiter='\t',
converters=convert_dict,
index_col=0)
a_timer.set_can_csv_to_df()
# sanity check output of the original data
# print("\nSample of the original data:")
# print(self.data.head(5), "\n")
@staticmethod
def generate_j1979_dictionary(j1979_data: DataFrame) -> dict:
d = {}
services = j1979_data.groupby('b2')
for uds_pid, data in services:
d[uds_pid] = J1979(uds_pid, data)
return d
def generate_arb_id_dictionary(self,
a_timer: PipelineTimer,
normalize_strategy: Callable,
time_conversion: int = 1000,
freq_analysis_accuracy: float = 0.0,
freq_synchronous_threshold: float = 0.0,
force: bool = False) -> (dict, dict):
if path.isfile(self.id_output_filename):
if force:
# Remove any existing pickled Arb ID dictionary and create one based on this data.
remove(self.id_output_filename)
remove(self.j1979_output_filename)
self.import_csv(a_timer, self.data_filename)
else:
arb_id_dict = load(open(self.id_output_filename, "rb"))
j1979_dict = load(open(self.j1979_output_filename, "rb"))
return arb_id_dict, j1979_dict
else:
self.import_csv(a_timer, self.data_filename)
id_dictionary = {}
j1979_dictionary = {}
a_timer.start_function_time()
for arb_id in Series.unique(self.data['id']):
if isinstance(arb_id, int64):
if arb_id == 2015:
# These are the J1979 requests (if any) (ID 0x7DF = 2015). Just ignore them.
continue
elif arb_id == 2024:
# These are the J1979 responses (ID 0x7DF + 0x8 = 0x7E8 = 2024)
j1979_data = self.data.loc[self.data['id'] == arb_id].copy()
j1979_data.drop('dlc', axis=1, inplace=True)
j1979_data.drop('id', axis=1, inplace=True)
a_timer.start_nested_function_time()
j1979_dictionary = self.generate_j1979_dictionary(j1979_data)
a_timer.set_j1979_creation()
elif arb_id > 0:
a_timer.start_iteration_time()
this_id = ArbID(arb_id)
this_id.original_data = self.data.loc[self.data['id'] == arb_id]
this_id.original_data = this_id.original_data.copy() # type: DataFrame
# Check if the Arbitration ID always used the same DLC. If not, ignore it.
# We can effectively ignore this Arb ID by not adding it to the Arb ID dictionary.
if this_id.original_data['dlc'].nunique() != 1:
continue
this_id.dlc = this_id.original_data['dlc'].iloc[0]
this_id.original_data.drop('dlc', axis=1, inplace=True)
this_id.original_data.drop('id', axis=1, inplace=True)
# If DLC < 8, we can automatically drop data column vectors > DLC.
# E.G. drop bytes "B7" and "B6" if DLC is 6; those are padding data injected by can-dump and were
# not actually on the bus.
if this_id.dlc < 8:
for i in range(this_id.dlc, 8):
this_id.original_data.drop('b' + str(i), axis=1, inplace=True)
# Check if there are duplicate index values and correct them.
if not this_id.original_data.index.is_unique:
correction_mask = this_id.original_data.index.duplicated()
this_id.original_data = this_id.original_data[~correction_mask]
this_id.generate_binary_matrix_and_tang(a_timer, normalize_strategy)
this_id.analyze_transmission_frequency(time_convert=time_conversion,
ci_accuracy=freq_analysis_accuracy,
synchronous_threshold=freq_synchronous_threshold)
id_dictionary[arb_id] = this_id
a_timer.set_arb_id_creation()
a_timer.set_raw_df_to_arb_id_dict()
return id_dictionary, j1979_dictionary

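An illustration of the converter functions defined inside import_csv, applied to hypothetical field values; the exact column layout of the logger output (including which trailing character the time field carries) is an assumption here.

def hex2int(x):
    return 0 if x == '' else int(x, 16)

def fix_time(x):
    try:
        return float(str(x)[:-1])   # drops one trailing character from the time field
    except ValueError:
        return None

print(hex2int('7df'))       # 2015, the J1979 request Arb ID
print(hex2int(''))          # 0; empty payload bytes become zero
print(fix_time('12.345)'))  # 12.345, assuming a trailing ')' in the time column
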
Pipeline/SemanticAnalysis.py Normal file

@@ -0,0 +1,336 @@
from pandas import concat, DataFrame, read_csv
from numpy import zeros
from os import path, remove
from pickle import load
from ast import literal_eval
from J1979 import J1979
from Signal import Signal
from PipelineTimer import PipelineTimer
def subset_selection(a_timer: PipelineTimer,
signal_dict: dict = None,
subset_pickle: str = "",
force: bool = False,
subset_size: float = 0.25) -> DataFrame:
if path.isfile(subset_pickle):
if force:
# Remove any existing pickled Signal dictionary and create one.
remove(subset_pickle)
else:
print("\nSubset selection already completed and forcing is turned off. Using pickled data...")
return load(open(subset_pickle, "rb"))
a_timer.start_function_time()
signal_index = 0
for k_arb_id, arb_id_signals in signal_dict.items():
for k_signal_id, signal in arb_id_signals.items():
if not signal.static:
signal_index += 1
# setup subset selection data structure
df: DataFrame = DataFrame(zeros((signal_index, 4)),
columns=["arb_id", "start_index", "stop_index", "Shannon_Index"])
for i, (k_arb_id, arb_id_signals) in enumerate(signal_dict.items()):
for j, (k_signal_id, signal) in enumerate(arb_id_signals.items()):
if not signal.static:
df.iloc[signal_index-1] = [k_arb_id, signal.start_index, signal.stop_index, signal.shannon_index]
signal_index -= 1
# sort by Shannon Index
df.sort_values(by="Shannon_Index", inplace=True, ascending=False)
# Select subset with largest Shannon Index Values
df = df.head(int(round(df.__len__() * subset_size, 0)))
# In order to make an arb ID sorted output, sort this subset by arb_id
df.sort_values(by="arb_id", inplace=True)
# Re-index each Signal in the subset using the Signal with the most observed samples. Prepare to create a DataFrame
# that can be used for generating a correlation matrix.
subset = []
subset_cols = []
largest_index = []
for index, row in df.iterrows():
signal_id = (int(row[0]), int(row[1]), int(row[2]))
signal = signal_dict[row[0]][signal_id]
subset.append(signal)
subset_cols.append(signal_id)
if signal.time_series.__len__() > largest_index.__len__():
largest_index = signal.time_series.index
subset_df: DataFrame = DataFrame(zeros((largest_index.__len__(), subset.__len__())),
columns=subset_cols,
index=largest_index)
for signal in subset:
signal_id = (signal.arb_id, signal.start_index, signal.stop_index)
subset_df[signal_id] = signal.time_series.reindex(index=largest_index, method='nearest')
a_timer.set_subset_selection()
return subset_df
def subset_correlation(subset: DataFrame,
csv_correlation_filename: str,
force: bool = False) -> DataFrame:
if not force and path.isfile(csv_correlation_filename):
print("\nA subset correlation appears to exist and forcing is turned off. Using " + csv_correlation_filename)
# Read the .csv into a DataFrame. Also, we need to convert the columns and index from strings back to tuples.
# Pandas.read_csv brings the data in as a DataFrame. Pandas.DataFrame.rename converts the columns and index with
# ast.literal_eval. Literal_eval will convert a string representation of a tuple back to an actual tuple.
return read_csv(csv_correlation_filename, index_col=0).rename(index=literal_eval, columns=literal_eval)
else:
return subset.corr()
def greedy_signal_clustering(correlation_matrix: DataFrame = None,
correlation_threshold: float = 0.8,
fuzzy_labeling: bool = True) -> dict:
correlation_keys = correlation_matrix.columns.values
previously_clustered_signals = {}
cluster_dict = {}
new_cluster_label = 0
for n, row in enumerate(correlation_keys):
for m, col in enumerate(correlation_keys):
if n == m:
# this is a diagonal on the correlation matrix. Skip it.
continue
# I chose to round here to allow relationships 'oh so close' to making it. No reason this HAS to be done.
result = round(correlation_matrix.iloc[n, m], 2)
# check if this is a significant correlation according to our heuristic threshold.
if result >= correlation_threshold:
# Check if the current row signal is currently unlabeled
if row not in previously_clustered_signals.keys():
# Check if the current col signal is currently unlabeled
if col not in previously_clustered_signals.keys():
# Both signals are unlabeled. Create a new one.
cluster_dict[new_cluster_label] = [row, col]
previously_clustered_signals[row] = {new_cluster_label}
previously_clustered_signals[col] = {new_cluster_label}
# print("created new cluster #", new_cluster_label, cluster_dict[new_cluster_label])
new_cluster_label += 1
else:
# Row isn't labeled but col is; add row to all of col's clusters.
# print("adding", row, "to clusters", previously_clustered_signals[col])
# row is not already in a cluster, add it to col's set of clusters
for label in previously_clustered_signals[col]:
cluster_dict[label].append(row)
previously_clustered_signals[row] = previously_clustered_signals[col]
else:
# Check if the current col signal is currently unlabeled
if col not in previously_clustered_signals.keys():
# Row is labeled but col is not; add col to row's set of clusters
# print("adding", col, "to clusters", previously_clustered_signals[row])
for label in previously_clustered_signals[row]:
cluster_dict[label].append(col)
previously_clustered_signals[col] = previously_clustered_signals[row]
# Both signals are already labeled
else:
# Check if we're using fuzzy labeling (a signal can belong to multiple clusters).
# If so, check whether the intersection of both sets of labels is empty. If it is, this is a
# relationship that hasn't already been captured by an existing cluster. Make a new one.
if fuzzy_labeling:
row_label_set = previously_clustered_signals[row]
col_label_set = previously_clustered_signals[col]
if not row_label_set & col_label_set:
cluster_dict[new_cluster_label] = [row, col]
previously_clustered_signals[row] = {new_cluster_label} | row_label_set
previously_clustered_signals[col] = {new_cluster_label} | col_label_set
# print("created new cluster #", new_cluster_label, cluster_dict[new_cluster_label])
new_cluster_label += 1
else:
# We're using fuzzy labeling and these two signals represent a 'bridge' between two
# signal clusters. Fold col into row's clusters and delete col's unique cluster indices.
for label in row_label_set - col_label_set:
cluster_dict[label].append(col)
previously_clustered_signals[col] = row_label_set | col_label_set
for label in col_label_set - row_label_set:
cluster_dict[label].append(row)
previously_clustered_signals[row] = row_label_set | col_label_set
# print(row, col, "already in cluster_dict", previously_clustered_signals[row], "&",
# previously_clustered_signals[col])
# Delete any duplicate clusters
cluster_sets = []
deletion_labels = []
for label, cluster in cluster_dict.items():
this_set = set(cluster)
if this_set in cluster_sets:
deletion_labels.append(label)
else:
cluster_sets.append(this_set)
for label in deletion_labels:
del cluster_dict[label]
return cluster_dict
# NOTE: This method has a LOT of redundancy with the subset_selection, signal_correlation, and greedy_signal_clustering
# logic. If you know you always want to perform label propagation, it would be more efficient to incorporate it directly
# into those functions. Since this code base is more of a Proof of Concept, label propagation is deliberately pulled
# out as a distinct method to make the pipeline steps as distinct as possible.
def label_propagation(a_timer: PipelineTimer,
pickle_clusters_filename: str = '',
pickle_all_signals_df_filename: str = '',
csv_signals_correlation_filename: str = '',
signal_dict: dict = None,
cluster_dict: dict = None,
correlation_threshold: float = 0.8,
force: bool = False):
if path.isfile(pickle_all_signals_df_filename) and path.isfile(csv_signals_correlation_filename):
if force:
# Remove any existing data.
remove(pickle_all_signals_df_filename)
remove(csv_signals_correlation_filename)
remove(pickle_clusters_filename)
else:
print("\nA DataFrame and correlation matrix for label propagation appears to exist and forcing is turned "
"off. Using " + pickle_all_signals_df_filename + ", " + csv_signals_correlation_filename + ", and "
+ pickle_clusters_filename)
return [load(open(pickle_all_signals_df_filename, "rb")),
read_csv(csv_signals_correlation_filename, index_col=0).rename(index=literal_eval,
columns=literal_eval),
load(open(pickle_clusters_filename, "rb"))]
a_timer.start_function_time()
non_static_signals_dict = {}
largest_index = []
df_columns = []
# Put all non-static signals into one DataFrame. Re-index all of them to share the same index.
for k_arb_id, arb_id_signals in signal_dict.items():
for k_signal_id, signal in arb_id_signals.items():
if not signal.static:
non_static_signals_dict[k_signal_id] = signal
df_columns.append(k_signal_id)
if signal.time_series.__len__() > largest_index.__len__():
largest_index = signal.time_series.index
df: DataFrame = DataFrame(zeros((largest_index.__len__(), df_columns.__len__())),
columns=df_columns,
index=largest_index)
for k_signal_id, signal in non_static_signals_dict.items():
df[k_signal_id] = signal.time_series.reindex(index=largest_index, method='nearest')
# Calculate the correlation matrix for this DataFrame of all non-static signals.
correlation_matrix = df.corr()
# Re-run the algorithm from greedy_signal_clustering but omitting the logic for creating new clusters.
# This effectively propagates the labels generated by the subset of signals with the largest Shannon Index values
# to any correlated signals which were not part of that subset.
correlation_keys = correlation_matrix.columns.values
previously_clustered_signals = {}
for k_cluster_id, cluster in cluster_dict.items():
for k_signal_id in cluster:
previously_clustered_signals[k_signal_id] = k_cluster_id
for n, row in enumerate(correlation_keys):
for m, col in enumerate(correlation_keys):
if n == m:
# this is a diagonal on the correlation matrix. Skip it.
continue
# I chose to round here to allow relationships 'oh so close' to making it. No reason this HAS to be done.
result = round(correlation_matrix.iloc[n, m], 2)
# check if this is a significant correlation according to our heuristic threshold.
if result >= correlation_threshold:
# if row signal is already a member of a cluster
if row in previously_clustered_signals.keys():
# if col signal is already a member of a cluster
if col in previously_clustered_signals.keys():
# print(row, col, "already in clusters", previously_clustered_signals[row], "&",
# previously_clustered_signals[col])
continue
# if col is not already in a cluster, add it to row's cluster
else:
# print("adding", col, "to cluster", clusters[previously_clustered_signals[row]])
cluster_dict[previously_clustered_signals[row]].append(col)
previously_clustered_signals[col] = previously_clustered_signals[row]
# row signal hasn't been added to a cluster
else:
# if col signal is already a member of a cluster
if col in previously_clustered_signals.keys():
# print("adding", row, "to cluster", clusters[previously_clustered_signals[col]])
# row is not already in a cluster, add it to col's cluster
cluster_dict[previously_clustered_signals[col]].append(row)
previously_clustered_signals[row] = previously_clustered_signals[col]
a_timer.set_label_propagation()
df.dropna(axis=0, how='any', inplace=True)
df.dropna(axis=1, how='any', inplace=True)
return df, correlation_matrix, cluster_dict
def j1979_signal_labeling(a_timer: PipelineTimer,
j1979_corr_filename: str = "",
df_signals: DataFrame = None,
j1979_dict: dict = None,
signal_dict: dict = None,
correlation_threshold: float = 0.8,
force: bool = False):
if path.isfile(j1979_corr_filename):
if force:
# Remove any existing data.
remove(j1979_corr_filename)
else:
print("\nA J1979 correlation matrix for signal labeling appears to exist and forcing is turned off. Using "
+ j1979_corr_filename)
return signal_dict, load(open(j1979_corr_filename, "rb"))
latest_start_index = 0.0
earliest_end_index = 99999999999999.9
df_columns = []
for pid, pid_data in j1979_dict.items(): # type: int, J1979
if latest_start_index < pid_data.data.index[0]:
latest_start_index = pid_data.data.index[0]
if earliest_end_index > pid_data.data.index[-1]:
earliest_end_index = pid_data.data.index[-1]
df_columns.append(pid_data.title)
df_signals = df_signals.loc[latest_start_index:earliest_end_index] # type: DataFrame
df_j1979: DataFrame = DataFrame(zeros((df_signals.shape[0], df_columns.__len__())),
columns=df_columns,
index=df_signals.index)
for pid, pid_data in j1979_dict.items(): # type: int, J1979
df_j1979[pid_data.title] = pid_data.data.reindex(index=df_signals.index, method='nearest')
df_combined = concat([df_signals, df_j1979], axis=1)
correlation_matrix = df_combined.corr()
correlation_matrix.dropna(axis=1, how='all', inplace=True)
correlation_matrix.dropna(axis=0, how='all', inplace=True)
# Just consider the J1979 column correlations. Slice off the identity rows at the end of the DataFrame as well.
for index, row in correlation_matrix[df_columns][:-len(df_columns)].iterrows():
row = abs(row)
max_index = row.idxmax(skipna=True)
if row[max_index] >= correlation_threshold:
signal = signal_dict[index[0]][index] # type: Signal
# print("Adding J1979 attribute " + max_index + " to signal ID " + str(index))
signal.j1979_title = max_index
signal.j1979_pcc = row[max_index]
signal_dict[index[0]][index] = signal
# print(i, index, row[max_index], max_index, row.values)
return signal_dict, correlation_matrix[df_columns][:-len(df_columns)]
# correlation_matrix.to_csv('j1979_correlation.csv')

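A usage sketch of greedy_signal_clustering on a hand-made 3x3 correlation matrix; the signal keys and correlation values are invented, and the module name follows the imports in Main.py.

from pandas import DataFrame
from SemanticAnalysis import greedy_signal_clustering

keys = [(272, 0, 7), (272, 8, 15), (554, 0, 11)]   # (arb_id, start_bit, stop_bit)
corr = DataFrame([[1.00, 0.95, 0.20],
                  [0.95, 1.00, 0.10],
                  [0.20, 0.10, 1.00]], index=keys, columns=keys)
clusters = greedy_signal_clustering(corr, correlation_threshold=0.85, fuzzy_labeling=True)
print(clusters)   # {0: [(272, 0, 7), (272, 8, 15)]} with these toy values
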
Pipeline/Signal.py Normal file

@@ -0,0 +1,43 @@
from pandas import Series
from math import log10
class Signal:
def __init__(self, arb_id: int, start_index: int, stop_index: int):
self.arb_id: int = arb_id
self.start_index: int = start_index
self.stop_index: int = stop_index
self.time_series: Series = None
self.static: bool = True
self.shannon_index: float = 0
self.plot_title: str = ""
self.j1979_title: str = None
self.j1979_pcc: float = 0
def normalize_and_set_metadata(self, normalize_strategy):
self.set_shannon_index()
self.update_static()
self.set_plot_title()
self.normalize(normalize_strategy)
def set_shannon_index(self):
si: float = 0.0
n: int = self.time_series.__len__()
for count in self.time_series.value_counts():
# calculate proportion of this integer value in the total population of values
p_i = count / n
# calculate the Shannon Index (si) given p_i of this value.
si += p_i * log10(p_i)
si *= -1
self.shannon_index = si
def update_static(self):
if self.shannon_index >= .000001:
self.static = False
def set_plot_title(self):
self.plot_title = "Time Series from Bit Positions " + str(self.start_index) + " to " + str(self.stop_index) + \
" of Arb ID " + hex(int(self.arb_id))
def normalize(self, normalize_strategy):
normalize_strategy(self.time_series.values, copy=False)

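A worked example of the Shannon index computed in set_shannon_index, using a tiny made-up time series with three distinct values.

from math import log10
from pandas import Series

values = Series([1, 1, 1, 1, 2, 2, 3, 3])
n = len(values)
si = -sum((c / n) * log10(c / n) for c in values.value_counts())
print(round(si, 3))   # 0.452 -> non-zero, so update_static() would mark the signal non-static
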
View File

@@ -0,0 +1,97 @@
from typing import Callable, List
from pandas import DataFrame
from numpy import float64, logical_xor, mean, ndarray, sqrt, std, sum, uint8, zeros
from PipelineTimer import PipelineTimer
# noinspection PyArgumentList
class ArbID:
def __init__(self, arb_id: int):
self.id: int = arb_id
# These features are set by PreProcessing.py's generate_arb_id_dictionary
self.dlc: int = 0
self.original_data: DataFrame = None
# These features are set in generate_binary_matrix_and_tang called by generate_arb_id_dictionary
self.boolean_matrix: ndarray = None
self.tang: ndarray = None
# Static and short are just bookkeeping flags to let other methods know this Arb ID probably isn't worth analyzing
self.static: bool = True
self.short: bool = True
# These features are set in analyze_transmission_frequency called by generate_arb_id_dictionary
self.ci_sensitivity: float = 0.0
self.freq_mean: float = 0.0
self.freq_std: float = 0.0
self.freq_ci: tuple = None
self.mean_to_ci_ratio: float = 0.0
self.synchronous: bool = False
# These features are set by LexicalAnalysis.py's get_composition
self.tokenization: List[tuple] = []
self.padding: List[int] = []
@staticmethod
def generate_tang(boolean_matrix):
transition_matrix = logical_xor(boolean_matrix[:-1, ], boolean_matrix[1:, ])
return sum(transition_matrix, axis=0, dtype=float64)
def generate_binary_matrix_and_tang(self, a_timer: PipelineTimer, normalize_strategy: Callable):
a_timer.start_nested_function_time()
self.boolean_matrix = zeros((self.original_data.__len__(), self.dlc * 8), dtype=uint8)
for i, row in enumerate(self.original_data.itertuples()):
for j, cell in enumerate(row[1:]):
# Skip cells that were already 0
if cell > 0:
# i is the row in the boolean_matrix
# j*8 is the left hand bit for this byte in the payload
# j*8 + 8 is the right hand bit for this byte in the payload
# e.g. byte index 1 starts at bits 1*8 = 8 to 1*8+8 = 16; [8:16]
# likewise, byte index 7 starts at bits 7*8 = 56 to 7*8+8 = 64
# Numpy indexing is non-inclusive of the upper bound. So [0:8] is the first 8 elements
bin_string = format(cell, '08b')
self.boolean_matrix[i, j * 8:j * 8 + 8] = [x == '1' for x in bin_string]
a_timer.set_hex_to_bool_matrix()
if self.boolean_matrix.shape[0] > 1:
a_timer.start_nested_function_time()
self.tang = self.generate_tang(boolean_matrix=self.boolean_matrix)
# Ensure there are no divide-by-zero issues caused by an all-zero TANG vector
if max(self.tang) > 0:
# TODO: This conditional path should account for there only being one value in all the signals.
# see Plotter.py plot_signals_by_arb_id() for how this crashes plotting.
normalize_strategy(self.tang, axis=0, copy=False)
self.static = False
if self.original_data.shape[0] > 4:
self.short = False
a_timer.set_bool_matrix_to_tang()
def analyze_transmission_frequency(self,
time_convert: int = 1000,
ci_accuracy: float = 1.645,
synchronous_threshold: float = 0.1):
if self.short or self.original_data.__len__() < 4:
return
self.ci_sensitivity = ci_accuracy
# time_convert = 1000 is intended to convert seconds to milliseconds.
freq_intervals = self.original_data.index[1:] - self.original_data.index[:-1]
self.freq_mean = mean(freq_intervals) * time_convert
self.freq_std = std(freq_intervals, ddof=1)*time_convert
# Assumes distribution of freq_intervals is gaussian normal.
mean_offset = ci_accuracy * self.freq_std / sqrt(len(freq_intervals))
self.freq_ci = (self.freq_mean - mean_offset, self.freq_mean + mean_offset)
        # mean_to_ci_ratio is the ratio of the CI range to the mean value. This provides a heuristic to determine
# if this Arb ID can be called 'synchronous' given the time scale of its average transmission frequency.
# For example, imagine an Arb ID transmitting with a mean frequency of exactly one second. Its 90% Confidence
# Interval spans 50 milliseconds about that ideal mean. It's reasonable to assume this Arb ID was engineered to
# be a 'synchronous' signal. However, a different Arb ID with a mean frequency of 40 milliseconds and the same
# confidence interval could not be reasonably assumed to have been engineered to be 'synchronous'. This other
# Arb ID was engineered to be 'high frequency' compared to the first example, but there isn't evidence that it's
# intended to be a high frequency 'synchronous' process that adheres to a certain clock frequency.
# This inference relies upon the assumption that the OEM didn't engineer the bus to be a train wreck with some
# IDs losing an arbitrarily large percentage of their arbitration phases.
self.mean_to_ci_ratio = 2*mean_offset/self.freq_mean
if self.mean_to_ci_ratio <= synchronous_threshold:
self.synchronous = True
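
# A minimal, illustrative sketch of the heuristic above using purely hypothetical numbers:
# a 1000 ms mean gap, a 60.8 ms sample standard deviation, and 100 observed gaps give a
# 90% CI half-width of roughly 10 ms, so the CI-to-mean ratio is ~0.02 and this Arb ID
# would be flagged synchronous at the default 0.1 threshold.
if __name__ == '__main__':
    freq_mean, freq_std, n_gaps = 1000.0, 60.8, 100        # hypothetical values (ms)
    mean_offset = 1.645 * freq_std / sqrt(n_gaps)          # ~10.0 ms CI half-width
    print(2 * mean_offset / freq_mean <= 0.1)              # True -> 'synchronous'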

View File

@ -0,0 +1,76 @@
from os import chdir, path, getcwd, walk
import re
from Sample import Sample
can_data_filename: str = ''
class FileBoi:
def __init__(self):
self.sample_dict = self.go_fetch()
# path.exists(output_folder):
# mkdir(output_folder)
# chdir(output_folder)
@staticmethod
def go_fetch(kfold_n: int = 5):
# NOTE: File structure relative to this script is expected to be as follows.
# .
# +-- Captures
# | +-- Make x_0
# | | +-- Model y_0
# | | | +-- ModelYear z_0
# | | | | +-- Samples
# | | | | | +-- sample = re.match('loggerProgram[\d]+.log', a_file_in_this_folder)
# | +-- Make x_1.... etc.
# +-- Some folder
# | +-- The directory with these scripts
# | | +-- this_script.py
script_dir: str = getcwd()
chdir("../../")
if not path.exists("Captures"):
# Make sure your local directory structure matches the example above. If not... adjust accordingly
print("Error finding Captures folder. Please check the relative path between this script and Captures.")
print("See the source of go_fetch() in FileBoi.py for an example of the expected relative paths.")
chdir(script_dir)
quit()
chdir("Captures")
root_dir = getcwd()
make: str = ""
model: str = ""
year: str = ""
current_vehicle = []
sample_dict = {}
for dirName, subdirList, fileList in walk(root_dir, topdown=True):
this_dir = path.basename(dirName)
if len(subdirList) == 0:
if len(current_vehicle) == 3:
make = current_vehicle[0]
model = current_vehicle[1]
year = current_vehicle[2]
elif len(current_vehicle) == 2:
model = current_vehicle[0]
year = current_vehicle[1]
                elif len(current_vehicle) == 1 and current_vehicle[0] != "":
year = current_vehicle[0]
for file in fileList:
# Check if this file name matches the expected name for a CAN data sample. If so, create new Sample
                    m = re.match(r'loggerProgram[\d]+\.log', file)
if m:
                        if (make, model, year) not in sample_dict:
sample_dict[(make, model, year)] = []
this_sample_index = str(len(sample_dict[(make, model, year)]))
this_sample = Sample(make=make, model=model, year=year, sample_index=this_sample_index,
                                             sample_path=path.join(dirName, m.group(0)), kfold_n=kfold_n)
sample_dict[(make, model, year)].append(this_sample)
current_vehicle = []
else:
if this_dir == "Captures":
continue
current_vehicle.append(this_dir)
chdir(script_dir)
return sample_dict
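
# Minimal usage sketch: the dictionary built in __init__ (and returned by go_fetch) is keyed
# by (make, model, year) tuples, each mapping to a list of Sample objects found under the
# Captures tree described above. Output depends entirely on the local directory contents.
if __name__ == '__main__':
    for vehicle, sample_list in FileBoi().sample_dict.items():
        print(vehicle, "->", len(sample_list), "sample(s)")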

View File

@ -0,0 +1,97 @@
from pandas import DataFrame, Series
from numpy import int8, float16, uint8, uint16
class J1979:
def __init__(self, pid: int, original_data: DataFrame):
self.pid: int = pid
self.title: str = ""
self.data: Series = self.process_response_data(original_data)
print("Found " + str(self.data.shape[0]) + " responses for J1979 PID " + str(hex(self.pid)) + ":", self.title)
def process_response_data(self, original_data) -> Series:
# ISO-TP formatted Universal Diagnostic Service (UDS) requests that were sent by the CAN collection device
# during sampling. Request made using Arb ID 0x7DF with DLC of 8. Response should use Arb ID 7E8 (0x7DF + 0x8).
# DataFrame Columns: b0 b1 b2 b3 ... b7
# -- -- -- -- --
# PID 0x0C (12 dec) (Engine RPM): 02 01 0c 00 ... 00
# PID 0x0D (13 dec) (Vehicle Speed): 02 01 0d 00 ... 00
        # PID 0x11 (17 dec) (Throttle Pos.):   02   01   11   00  ...  00
# PID 0x61 (97 dec) (Demand % Torque): 02 01 61 00 ... 00
# PID 0x62 (98 dec) (Engine % Torque): 02 01 62 00 ... 00
# PID 0x63 (99 dec) (Ref. Torque): 02 01 63 00 ... 00
# PID 0x8e (142 dec) (Friction Torque): 02 01 8e 00 ... 00
# Responses being managed here should follow the ISO-TP + UDS per-byte format AA BB CC DD .. DD
# BYTE: AA BB CC DD ... DD
# USE: response size (bytes) UDS mode + 0x40 UDS PID response data
# DF COLUMN: b0 b1 b2 b3 ... b7
# Remember that this response data is already converted to decimal. Thus, byte BB = 65 = 0x41 = 0x01 + 0x40.
        # If BB isn't 0x41, check what the error code is. Some error codes are listed in the UDS chapter of the car
# hacker's handbook available at http://opengarages.org/handbook/ebook/.
if self.pid == 12:
self.title = 'Engine RPM'
            # PID is 0x0C: Engine RPM. 2 bytes of data AA BB converted using 1/4 RPM per bit: (256*AA+BB)/4
# Min value: 0 Max value: 16,383.75 units: rpm
return Series(data=(256*original_data['b3']+original_data['b4'])/4,
index=original_data.index,
name=self.title,
dtype=float16)
elif self.pid == 13:
self.title = 'Speed km/h'
# PID is 0x0D: Vehicle Speed. 1 byte of data AA using 1km/h per bit: no conversion necessary
# Min value: 0 Max value: 255 (158.44965mph) units: km/h
return Series(data=original_data['b3'],
index=original_data.index,
name=self.title,
dtype=uint8)
elif self.pid == 17:
self.title = 'Throttle %'
# PID is 0x11: Throttle Position. 1 byte of data AA using 100/255 % per bit: AA * 100/255% throttle.
# Min value: 0 Max value: 100 units: %
return Series(data=100 * original_data['b3'] / 255,
index=original_data.index,
name=self.title,
dtype=uint8)
elif self.pid == 97:
self.title = 'Demand Torque %'
# PID is 0x61: Driver's demand engine - percent torque. 1 byte of data AA using 1%/bit with -125 offset
# AA - 125
# Min value: -125 Max value: 130 units: %
return Series(data=original_data['b3'] - 125,
index=original_data.index,
name=self.title,
dtype=int8)
elif self.pid == 98:
self.title = 'Actual Torque %'
# PID is 0x62: Actual engine - percent torque. 1 byte of data AA using 1%/bit with -125 offset
# AA - 125
# Min value: -125 Max value: 130 units: %
return Series(data=original_data['b3'] - 125,
index=original_data.index,
name=self.title,
dtype=int8)
elif self.pid == 99:
self.title = 'Reference Torque Nm'
            # PID is 0x63: Engine reference torque. 2 bytes of data AA BB using 1 Nm/bit: 256*AA + BB Nm torque
# Min value: 0 Max value: 65,535 units: Nm
return Series(data=256*original_data['b3'] + original_data['b4'],
index=original_data.index,
name=self.title,
dtype=uint16)
elif self.pid == 142:
self.title = 'Engine Friction Torque %'
# PID is 0x8E: Engine Friction - Percent Torque. 1 byte of data AA using 1%/bit with -125 offset. AA - 125
# Min value: -125 Max value: 130 units: %
return Series(data=original_data['b3'] - 125,
index=original_data.index,
name=self.title,
dtype=int8)
else:
# Looks like you were requesting J1979 data with your sniffer that hasn't been implemented in this code.
# Time to do the leg work to expand this class accordingly then re-run the pipeline.
raise ValueError("Encountered J1979 PID " + str(hex(self.pid)) +
" in Pre-Processing that hasn't been programmed. Expand the J1979 class to handle all "
"J1979 requests made during data collection.")

View File

@ -0,0 +1,189 @@
from numpy import float64, nditer, uint64, zeros, ndarray
from pandas import Series
from os import path, remove
from pickle import load
from ArbID import ArbID
from Signal import Signal
from PipelineTimer import PipelineTimer
from typing import List
def tokenize_dictionary(a_timer: PipelineTimer,
d: dict,
force: bool = False,
include_padding: bool = False,
merge: bool = True,
                        max_distance: float = 0.1):
a_timer.start_function_time()
for k, arb_id in d.items():
if not arb_id.static:
if arb_id.padding and not force:
print("\nTokenization already completed and forcing is turned off. Skipping...")
return
a_timer.start_iteration_time()
get_composition(arb_id, include_padding, max_distance)
a_timer.set_tang_to_composition()
if merge:
a_timer.start_iteration_time()
merge_tokens(arb_id, max_distance)
a_timer.set_composition_merge()
a_timer.set_tokenization()
# This is a greedy algorithm to cluster bit positions in a series of CAN payloads suspected of being part of a
# continuous numerical time series.
def get_composition_just_tang(this_tang: ndarray, include_padding=False, max_inversion_distance: float = 0.0):
tokens: List[tuple] = []
padding = []
start_index = 0
currently_clustering = False
big_endian = True
last_bit_position = 0
# Consider each element in the TANG. The TANG is an ndarray with index being bit position from the
# original CAN data. The cell value is the observed transition frequency for that bit position.
for i, bit_position in enumerate(nditer(this_tang)):
# Is this a padding bit?
if bit_position <= 0.000001:
padding.append(i)
# Are we clustering padding bits? If so, proceed to the normal clustering logic. Else, do the following.
if not include_padding:
if currently_clustering:
# This is padding, we're already clustering, and we're not clustering padding; save the token.
tokens.append((start_index, i - 1))
currently_clustering = False
start_index = i + 1
last_bit_position = bit_position
continue
# Are we still enlarging the current token?
if currently_clustering:
if bit_position >= last_bit_position and big_endian:
pass
elif bit_position <= last_bit_position and not big_endian:
pass
# Are we allowing inversions (max_inversion_distance > 0)? If so, check if this inversion is acceptable.
elif abs(bit_position - last_bit_position) <= max_inversion_distance:
pass
# Is this the second bit position we need to establish the endian of the signal?
elif start_index == i - 1:
if bit_position >= last_bit_position:
big_endian = True
else:
big_endian = False
# This is an unacceptable transition frequency inversion, save the current token and start a new one
else:
tokens.append((start_index, i - 1))
start_index = i
# We aren't currently clustering and we intend to cluster this bit position
else:
currently_clustering = True
start_index = i
last_bit_position = bit_position
# We reached the last bit position while clustering. Add this final token.
if currently_clustering:
tokens.append((start_index, this_tang.__len__() - 1))
return tokens, padding
def get_composition(arb_id: ArbID, include_padding=False, max_inversion_distance: float = 0.0):
arb_id.tokenization, arb_id.padding = \
get_composition_just_tang(arb_id.tang, include_padding, max_inversion_distance)
def merge_tokens_just_composition(tokens: list, this_tang, max_distance: float):
verbose = False
if tokens.__len__() < 2:
        # Make sure there are multiple tokens to merge
pass
else:
# Editing data structures while iterating over them is a bad idea in Python
        # Instead, let's keep track of tokens we want to delete using remove_list.
remove_list = []
last = None
for i, token in enumerate(tokens):
if verbose:
print("Last:", last, "\tCurrent:", token)
if last:
# Are these tokens adjacent?
if last[1] + 1 == token[0]:
if verbose:
print("\tAdjacent with distance of", abs(this_tang[last[1]] - this_tang[token[0]]))
# Is the transition frequency of the adjacent bit positions less than the max distance threshold?
if abs(this_tang[last[1]] - this_tang[token[0]]) <= max_distance:
remove_list.append(last)
token = (last[0], token[1])
tokens[i] = token
if verbose:
print("\t\tMerged into", token)
last = token
if remove_list:
for token in remove_list:
tokens.remove(token)
if verbose:
print("final tokenization", tokens)
return tokens
def merge_tokens(arb_id: ArbID, max_distance):
# if arb_id.id == 292: # make this equal to the decimal value of an Arb ID in the data you want to see get merged
# verbose = True
# else:
verbose = False
if verbose:
print("\nENTERING MERGE PHASE OF ARB ID", arb_id.id)
print("STARTING TOKENS:", arb_id.tokenization)
arb_id.tokenization = merge_tokens_just_composition(arb_id.tokenization, arb_id.tang, max_distance)
# noinspection PyTypeChecker
def generate_signals(a_timer: PipelineTimer,
arb_id_dict: dict,
signal_pickle_filename: str,
normalize_strategy,
force=False):
if force and path.isfile(signal_pickle_filename):
remove(signal_pickle_filename)
if path.isfile(signal_pickle_filename):
print("\nSignal generation already completed and forcing is turned off. Using pickled data...")
return load(open(signal_pickle_filename, "rb"))
a_timer.start_function_time()
signal_dict = {}
for k, arb_id in arb_id_dict.items():
if not arb_id.static:
for token in arb_id.tokenization:
a_timer.start_iteration_time()
signal = Signal(k, token[0], token[1])
# Convert the binary ndarray to a list of string representations of each row
temp1 = [''.join(str(x) for x in row) for row in arb_id.boolean_matrix[:, token[0]:token[1] + 1]]
temp2 = zeros((temp1.__len__(), 1), dtype=uint64)
# convert each string representation to int
for i, row in enumerate(temp1):
temp2[i] = int(row, 2)
# create an unsigned integer pandas.Series using the time index from this Arb ID's original data.
signal.time_series = Series(temp2[:, 0], index=arb_id.original_data.index, dtype=float64)
# Normalize the signal and update its meta-data
signal.normalize_and_set_metadata(normalize_strategy)
# add this signal to the signal dictionary which is keyed by Arbitration ID
if k in signal_dict:
signal_dict[k][(arb_id.id, signal.start_index, signal.stop_index)] = signal
else:
signal_dict[k] = {(arb_id.id, signal.start_index, signal.stop_index): signal}
a_timer.set_token_to_signal()
a_timer.set_signal_generation()
return signal_dict
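
# Illustrative sketch of the greedy tokenizer on a hypothetical 16-bit TANG: the first byte
# never toggles (padding) while the second byte has monotonically rising transition
# frequencies, so it should cluster into a single token.
if __name__ == '__main__':
    from numpy import array
    toy_tang = array([0, 0, 0, 0, 0, 0, 0, 0, .1, .2, .3, .4, .5, .6, .7, .8])
    toy_tokens, toy_padding = get_composition_just_tang(toy_tang, include_padding=False)
    print(toy_tokens)      # expected: [(8, 15)]
    print(toy_padding)     # expected: [0, 1, 2, 3, 4, 5, 6, 7]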

View File

@ -0,0 +1,38 @@
from FileBoi import FileBoi
from Sample import Sample
# from Plotter import plot_sample_threshold_heatmap
# Cross validation parameters for finding an optimal tokenization inversion distance threshold -- NOT WORKING?
kfold_n: int = 5
current_vehicle_number = 0
good_boi = FileBoi()
samples = good_boi.go_fetch(kfold_n)
for key, sample_list in samples.items(): # type: tuple, list
for sample in sample_list: # type: Sample
print(current_vehicle_number)
print("\nData import and Pre-Processing for " + sample.output_vehicle_dir)
id_dict, j1979_dict = sample.pre_process()
if j1979_dict:
sample.plot_j1979(j1979_dict, vehicle_number=str(current_vehicle_number))
        # The following 3 lines of code were intended to find good settings for TANG inversions... it didn't work?
# print("\nFinding optimal lexical analysis threshold parameters for " + sample.output_vehicle_dir)
# sample.find_lex_thresholds(id_dict)
# plot_sample_threshold_heatmap(sample)
# LEXICAL ANALYSIS #
print("\n\t##### BEGINNING LEXICAL ANALYSIS OF " + sample.output_vehicle_dir + " #####")
sample.tokenize_dictionary(id_dict)
signal_dict = sample.generate_signals(id_dict, bool(j1979_dict))
sample.plot_arb_ids(id_dict, signal_dict, vehicle_number=str(current_vehicle_number))
        # SEMANTIC ANALYSIS #
print("\n\t##### BEGINNING SEMANTIC ANALYSIS OF " + sample.output_vehicle_dir + " #####")
corr_matrix, combined_df = sample.generate_correlation_matrix(signal_dict)
if j1979_dict:
signal_dict, j1979_correlation = sample.j1979_labeling(j1979_dict, signal_dict, combined_df)
cluster_dict, linkage_matrix = sample.cluster_signals(corr_matrix)
sample.plot_clusters(cluster_dict, signal_dict, bool(j1979_dict), vehicle_number=str(current_vehicle_number))
sample.plot_dendrogram(linkage_matrix, vehicle_number=str(current_vehicle_number))
current_vehicle_number += 1

View File

@ -0,0 +1,149 @@
from time import time
from typing import List
class PipelineTimer:
def __init__(self, verbose: bool = True):
self.verbose: bool = verbose
self.function_time: float = 0.0
self.nested_function_time: float = 0.0
self.iteration_time: float = 0.0
self.nested_iteration_time: float = 0.0
# Pre-Processing Timings
self.can_csv_to_df: float = 0.0
self.raw_df_to_arb_id_dict: float = 0.0
self.arb_id_creation: List[float] = []
self.j1979_creation: float = 0.0
self.hex_to_bool_matrix: List[float] = []
self.bool_matrix_to_tang: List[float] = []
self.plot_save_j1979_dict: float = 0.0
self.plot_save_j1979_pid: List[float] = []
# Lexical Analysis Timings
self.tokenization: float = 0.0
self.tang_to_composition: List[float] = []
self.composition_merge: List[float] = []
self.signal_generation: float = 0.0
self.token_to_signal: List[float] = []
self.plot_save_arb_id_dict: float = 0.0
self.plot_save_arb_id: List[float] = []
# Semantic Analysis Timings
self.subset_selection: float = 0.0
self.plot_save_cluster_dict: float = 0.0
self.label_propagation: float = 0.0
self.plot_save_cluster: List[float] = []
def start_function_time(self):
self.function_time = time()
def start_nested_function_time(self):
self.nested_function_time = time()
def start_iteration_time(self):
self.iteration_time = time()
def start_nested_iteration_time(self):
self.nested_iteration_time = time()
# Pre-Processing Timings #
# Called in PreProcessor.import_csv
def set_can_csv_to_df(self):
self.can_csv_to_df = time() - self.function_time
if self.verbose:
print("\n" + str(self.can_csv_to_df) + " seconds to import and format raw data into a DataFrame")
# Called in PreProcessor.generate_arb_id_dictionary
def set_raw_df_to_arb_id_dict(self):
self.raw_df_to_arb_id_dict = time() - self.function_time
if self.verbose:
print("\n" + str(self.raw_df_to_arb_id_dict) +
" seconds to produce arbitration ID dictionary, boolean matrices, and TANGs")
# Called in the loop within PreProcessor.generate_arb_id_dictionary
def set_arb_id_creation(self):
self.arb_id_creation.append(time() - self.iteration_time)
def set_j1979_creation(self):
self.j1979_creation = time() - self.nested_function_time
if self.verbose:
print("\n" + str(self.j1979_creation) + " seconds to process J1979 response data into a dictionary")
# Called in ArbID.generate_binary_matrix_and_tang
def set_hex_to_bool_matrix(self):
        self.hex_to_bool_matrix.append(time() - self.nested_function_time)
# Called in ArbID.generate_binary_matrix_and_tang
def set_bool_matrix_to_tang(self):
        self.bool_matrix_to_tang.append(time() - self.nested_function_time)
# Called in the Plotter.py plot_j1979 function.
def set_plot_save_j1979_dict(self):
self.plot_save_j1979_dict = time() - self.function_time
if self.verbose:
print("\n" + str(self.plot_save_j1979_dict) + " seconds to plot and save the J1979 response data")
# Called in the loop within the Plotter.py plot_j1979 function.
def set_plot_save_j1979_pid(self):
self.plot_save_j1979_pid.append(time() - self.iteration_time)
# Lexical Analysis Timings #
# Called in the LexicalAnalysis.py tokenize_dictionary function.
def set_tokenization(self):
self.tokenization = time() - self.function_time
if self.verbose:
print("\n" + str(self.tokenization) + " seconds to tokenize the arbitration ID dictionary using TANGs")
# Called in the loop within the LexicalAnalysis.py tokenize_dictionary function.
def set_tang_to_composition(self):
self.tang_to_composition.append(time() - self.iteration_time)
# Called in the loop within the LexicalAnalysis.py tokenize_dictionary function.
def set_composition_merge(self):
self.composition_merge.append(time() - self.iteration_time)
# Called in the LexicalAnalysis.py generate_signals function.
def set_signal_generation(self):
self.signal_generation = time() - self.function_time
if self.verbose:
print("\n" + str(self.signal_generation) +
" seconds to generate signals and their statistics using token compositions.")
# Called in the loop within the LexicalAnalysis.py generate_signals function.
def set_token_to_signal(self):
self.token_to_signal.append(time() - self.iteration_time)
# Called in the Plotter.py plot_signals_by_arb_id function.
def set_plot_save_arb_id_dict(self):
self.plot_save_arb_id_dict = time() - self.function_time
if self.verbose:
print("\n" + str(self.plot_save_arb_id_dict) + " seconds to plot and save the Signals and TANGs by Arb ID")
# Called in the loop within the Plotter.py plot_signals_by_arb_id function.
def set_plot_save_arb_id(self):
self.plot_save_arb_id.append(time() - self.iteration_time)
# Semantic Analysis Timings #
# Called in the SemanticAnalysis.py subset_correlation function.
def set_subset_selection(self):
self.subset_selection = time() - self.function_time
# Called in the SemanticAnalysis.py label_propagation function.
def set_label_propagation(self):
self.label_propagation = time() - self.function_time
if self.verbose:
print("\n" + str(self.label_propagation) + " seconds to perform label propagation.")
# Called in the Plotter.py plot_signals_by_cluster function.
def set_plot_save_cluster_dict(self):
self.plot_save_cluster_dict = time() - self.function_time
if self.verbose:
print("\n" + str(self.plot_save_cluster_dict) + " seconds to plot and save the clusters.")
# Called in the loop within the Plotter.py plot_signals_by_cluster function.
def set_plot_save_cluster(self):
self.plot_save_cluster.append(time() - self.iteration_time)

View File

@ -0,0 +1,313 @@
import matplotlib.pyplot as plt
from matplotlib.pyplot import savefig
from numpy import where, isin
from os import chdir, mkdir, path, remove
from shutil import rmtree
from PipelineTimer import PipelineTimer
from scipy.cluster.hierarchy import dendrogram
figure_format: str = "png" # png works well for quickly previewing results and draft results write ups.
# figure_format: str = "eps" # Journals/Conferences generally prefer EPS file format for camera-ready copies.
figure_dpi: int = 300
figure_transp: bool = False
arb_id_folder: str = 'figures'
cluster_folder: str = 'clusters'
j1979_folder: str = 'j1979'
threshold_folder: str = 'threshold_heatmaps'
def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, vehicle_number: str,
force: bool=False):
if path.exists(arb_id_folder):
if force:
rmtree(arb_id_folder)
else:
print("\nArbID plotting appears to have already been done and forcing is turned off. Skipping...")
return
a_timer.start_function_time()
for k_id, signals in signal_dict.items():
arb_id = arb_id_dict[k_id]
if not arb_id.static and not arb_id.short:
print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ") for Vehicle " + vehicle_number)
a_timer.start_iteration_time()
signals_to_plot = []
# Don't plot the static signals
for k_signal, signal in signals.items():
if not signal.static:
signals_to_plot.append(signal)
# There's a corner case where the Arb ID only has static signals. This conditional accounts for this.
# TODO: This corner case should probably be reflected by arb_id.static.
if len(signals_to_plot) < 1:
continue
            # One row per signal plus one for the TANG. nrows is always at least 2 here, so axes is returned as an array.
fig, axes = plt.subplots(nrows=1 + len(signals_to_plot), ncols=1)
plt.suptitle("Time Series and TANG for Arbitration ID " + hex(k_id) + " from Vehicle " + vehicle_number,
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + len(signals_to_plot) + 1) * 1.3)
# The min() statement provides whitespace for the title depending on the number of subplots.
size_adjust = len(signals_to_plot) / 100
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
for i, signal in enumerate(signals_to_plot):
ax = axes[i]
ax.set_title(signal.plot_title,
style='italic',
size='medium')
ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
ax.plot(signal.time_series, color='black')
# Add a 25% opacity dashed black line to the entropy gradient plot at one boundary of each sub-flow
axes[-1].axvline(x=signal.start_index, alpha=0.25, c='black', linestyle='dashed')
# Plot the entropy gradient at the bottom of the overall output
ax = axes[-1]
ax.set_title("Min-Max Normalized Transition Aggregation N-Gram (TANG)",
style='italic',
size='medium')
tang_bit_width = arb_id.tang.shape[0]
ax.set_xlim([-0.01 * tang_bit_width, 1.005 * tang_bit_width])
y = arb_id.tang[:]
# Differentiate bit positions with non-zero and zero entropy using black points and grey x respectively.
ix = isin(y, 0)
pad_bit = where(ix)
non_pad_bit = where(~ix)
ax.scatter(non_pad_bit, y[non_pad_bit], color='black', marker='o', s=10)
ax.scatter(pad_bit, y[pad_bit], color='grey', marker='^', s=10)
if not path.exists(arb_id_folder):
mkdir(arb_id_folder)
chdir(arb_id_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig(hex(arb_id.id) + "." + figure_format,
                    bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
chdir("..")
plt.close(fig)
a_timer.set_plot_save_arb_id()
print("\tComplete...")
a_timer.set_plot_save_arb_id_dict()
def plot_signals_by_cluster(a_timer: PipelineTimer,
cluster_dict: dict,
signal_dict: dict,
use_j1979_tags: bool,
vehicle_number: str,
force: bool=False):
if path.exists(cluster_folder):
if force:
rmtree(cluster_folder)
else:
print("\nCluster plotting appears to have already been done and forcing is turned off. Skipping...")
return
a_timer.start_function_time()
print("\n")
for cluster_number, list_of_signals in cluster_dict.items():
print("Plotting cluster", cluster_number, "with " + str(len(list_of_signals)) + " signals.")
a_timer.start_iteration_time()
# Setup the plot
fig, axes = plt.subplots(nrows=len(list_of_signals), ncols=1, squeeze=False)
plt.suptitle("Signal Cluster " + str(cluster_number) + " from Vehicle " + vehicle_number,
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + len(list_of_signals)+1) * 1.3)
size_adjust = len(list_of_signals) / 100
# The min() statement provides whitespace for the suptitle depending on the number of subplots.
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
# Plot the time series of each signal in the cluster
for i, signal_key in enumerate(list_of_signals):
signal = signal_dict[signal_key[0]][signal_key]
ax = axes[i, 0]
if signal.j1979_title and use_j1979_tags:
this_title = signal.plot_title + " [" + signal.j1979_title + \
" (PCC:" + str(round(signal.j1979_pcc, 2)) + ")]"
else:
this_title = signal.plot_title
ax.set_title(this_title,
style='italic',
size='medium')
ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
ax.plot(signal.time_series, color='black')
if not path.exists(cluster_folder):
mkdir(cluster_folder)
chdir(cluster_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig("cluster_" + str(cluster_number) + "." + figure_format,
                bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
chdir("..")
plt.close(fig)
a_timer.set_plot_save_cluster()
print("\tComplete...")
a_timer.set_plot_save_cluster_dict()
def plot_j1979(a_timer: PipelineTimer, j1979_dict: dict, vehicle_number: str, force: bool=False):
if path.exists(j1979_folder):
if force:
rmtree(j1979_folder)
else:
print("\nJ1979 plotting appears to have already been done and forcing is turned off. Skipping...")
return
a_timer.start_function_time()
print("Plotting J1979 response data")
plot_length = len(j1979_dict.keys())
# Setup the plot
fig, axes = plt.subplots(nrows=plot_length, ncols=1, squeeze=False)
plt.suptitle("J1979 Data Collected from Vehicle " + vehicle_number,
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + plot_length) * 1.3)
size_adjust = plot_length / 100
# The min() statement provides whitespace for the suptitle depending on the number of subplots.
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
# Plot the time series of each signal in the cluster
for i, (pid, data) in enumerate(j1979_dict.items()):
a_timer.start_iteration_time()
ax = axes[i, 0]
ax.set_title("PID " + str(hex(pid)) + ": " + data.title,
style='italic',
size='medium')
ax.set_xlim([data.data.first_valid_index(), data.data.last_valid_index()])
ax.plot(data.data, color='black')
a_timer.set_plot_save_j1979_pid()
if not path.exists(j1979_folder):
mkdir(j1979_folder)
chdir(j1979_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig("j1979." + figure_format,
            bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
chdir("..")
plt.close(fig)
a_timer.set_plot_save_j1979_dict()
print("\tComplete...")
def plot_sample_threshold_heatmap(sample):
this_figure_name = "alignment_scores_" + sample.output_vehicle_dir + "." + figure_format
if path.exists(threshold_folder):
chdir(threshold_folder)
if path.isfile(this_figure_name):
if sample.force_threshold_plot:
remove(this_figure_name)
else:
print("\nThreshold heatmap plotting for " + sample.output_vehicle_dir + " already complete.")
print("\tHeatmap plot forcing is turned off. Skipping...")
chdir("..")
return
chdir("..")
print("\tPlotting threshold parameter-Alignment Score heatmap for " + sample.output_vehicle_dir)
if not path.exists(threshold_folder):
mkdir(threshold_folder)
chdir(threshold_folder)
fig, ax = plt.subplots()
halfway_mark = int(round(sample.avg_score_matrix.shape[0]/2, 0))
sample.avg_score_matrix = sample.avg_score_matrix[0:halfway_mark, 0:halfway_mark].astype(float)
sample.validator.set_lex_threshold_parameters(sample)
im = ax.imshow(sample.avg_score_matrix, cmap='inferno', interpolation='none', vmin=0.0, vmax=1.0)
# im = ax.imshow(sample.avg_score_matrix, cmap='Greys', interpolation='nearest')
# ax.set_xticks(arange(sample.avg_score_matrix.shape[1]))
# ax.set_yticks(arange(sample.avg_score_matrix.shape[0]))
cbar = ax.figure.colorbar(im, ax=ax)
cbar.ax.set_ylabel("Alignment Score", rotation=-90, va="bottom")
ax.set_title("Average Alignment Score as a Function of \n Lexical Analysis Threshold Parameters for " +
sample.year + " " + sample.make + " " + sample.model + "\nInversion: " +
str(round(sample.optimal_bit_dist, 2)) + " Merge: " + str(round(sample.optimal_merge_dist, 2)) +
" Score: " +
str(round(sample.avg_score_matrix[sample.optimal_bit_dist, sample.optimal_merge_dist], 2)))
fig.tight_layout()
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig(this_figure_name,
            bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
plt.close()
chdir("..")
print("\t\tComplete...")
def plot_dendrogram(a_timer: PipelineTimer,
linkage_matrix,
threshold: float,
vehicle_number: str,
force: bool = False):
dendrogram_filename = "dendrogram_" + vehicle_number + "." + figure_format
if path.isfile(dendrogram_filename):
if force:
remove(dendrogram_filename)
else:
print("Dendrogram already plotted. Skipping...")
return
plt.figure(figsize=(7, 7), dpi=600)
R: dict = dendrogram(Z=linkage_matrix, orientation='top', distance_sort='ascending', no_labels=True)
plt.title("Dendrogram of Agglomerative Clustering for Vehicle " + vehicle_number)
plt.xlabel("Signals Observed")
plt.ylabel("Single Linkage Cluster Merge Distance")
xmin, xmax = plt.xlim()
    # Add a 25% opacity dashed black line at the cluster merge distance threshold
plt.hlines(y=threshold, xmin=xmin, xmax=xmax, alpha=0.25, colors='black', linestyle='dashed',
label='cluster threshold')
plt.legend(loc='upper right')
print("\tPlotting dendrogram and saving to " + dendrogram_filename)
savefig(dendrogram_filename,
            bbox_inches='tight',
pad_inches=0.0,
dpi=600,
format=figure_format,
transparent=figure_transp)
plt.close()
print("\t\tComplete...")

View File

@ -0,0 +1,148 @@
from pandas import DataFrame, read_csv, Series
from numpy import int64
from os import path, remove, getcwd
from pickle import load
from typing import Callable
from ArbID import ArbID
from J1979 import J1979
from PipelineTimer import PipelineTimer
class PreProcessor:
def __init__(self, data_filename: str, id_output_filename: str, j1979_output_filename: str, use_j1979: bool):
self.data_filename: str = data_filename
self.id_output_filename: str = id_output_filename
self.j1979_output_filename: str = j1979_output_filename
self.data: DataFrame = None
self.import_time: float = 0.0
self.dictionary_time: float = 0.0
self.total_time: float = 0.0
self.use_j1979: bool = use_j1979
def import_csv(self, a_timer: PipelineTimer, filename):
def hex2int(x):
if x == '':
return 0
return int(x, 16)
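            # e.g. hex2int('7df') -> 2015; blank byte columns arrive as '' and map to 0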
def fix_time(x):
try:
return float(str(x)[:-1])
except ValueError:
# There may have been a newline the capture device was trying to write when turned off.
return None
# Used by pd.read_csv to apply the functions to the respective column vectors in the .csv file
convert_dict = {'time': fix_time, 'id': hex2int, 'dlc': hex2int, 'b0': hex2int, 'b1': hex2int, 'b2': hex2int,
'b3': hex2int, 'b4': hex2int, 'b5': hex2int, 'b6': hex2int, 'b7': hex2int}
print("\nReading in " + self.data_filename + "...")
a_timer.start_function_time()
self.data = read_csv(filename,
header=None,
names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
skiprows=7,
delimiter='\t',
converters=convert_dict,
index_col=0)
a_timer.set_can_csv_to_df()
# sanity check output of the original data
# print("\nSample of the original data:")
# print(self.data.head(5), "\n")
@staticmethod
def generate_j1979_dictionary(j1979_data: DataFrame) -> dict:
d = {}
services = j1979_data.groupby('b2')
for uds_pid, data in services:
d[uds_pid] = J1979(uds_pid, data)
return d
def generate_arb_id_dictionary(self,
a_timer: PipelineTimer,
normalize_strategy: Callable,
time_conversion: int = 1000,
freq_analysis_accuracy: float = 0.0,
freq_synchronous_threshold: float = 0.0,
force: bool = False) -> (dict, dict):
id_dictionary = {}
j1979_dictionary = {}
if force:
# Remove any existing pickled Arb ID and J1979 dictionaries and create new ones based on data_filename.
if path.isfile(self.id_output_filename):
remove(self.id_output_filename)
if path.isfile(self.j1979_output_filename):
remove(self.j1979_output_filename)
self.import_csv(a_timer, self.data_filename)
elif path.isfile(self.id_output_filename):
# This logic assumes that there will be a J1979 dict if and only if there is an Arb ID dict
print("\tLoading Arb ID dictionary from pickled data: " + getcwd() + "\\" + self.id_output_filename)
id_dictionary = load(open(self.id_output_filename, "rb"))
if path.isfile(self.j1979_output_filename):
print("\tLoading J1979 dictionary from pickled data: " + getcwd() + "\\" + self.j1979_output_filename)
j1979_dictionary = load(open(self.j1979_output_filename, "rb"))
print("\tSet 'force_pre_processing' in Sample.py to True to re-compute instead...")
return id_dictionary, j1979_dictionary
else:
self.import_csv(a_timer, self.data_filename)
a_timer.start_function_time()
for arb_id in Series.unique(self.data['id']):
if isinstance(arb_id, int64):
if arb_id == 2015:
                    # These are the J1979 requests (if any) (ID 0x7DF = 2015). Just ignore them.
continue
elif arb_id == 2024 and self.use_j1979:
                    # These are the J1979 responses (ID 0x7E8 = 2024, the standard response ID for requests sent on 0x7DF)
j1979_data = self.data.loc[self.data['id'] == arb_id].copy()
j1979_data.drop('dlc', axis=1, inplace=True)
j1979_data.drop('id', axis=1, inplace=True)
a_timer.start_nested_function_time()
j1979_dictionary = self.generate_j1979_dictionary(j1979_data)
a_timer.set_j1979_creation()
elif arb_id > 0:
a_timer.start_iteration_time()
this_id = ArbID(arb_id)
this_id.original_data = self.data.loc[self.data['id'] == arb_id]
this_id.original_data = this_id.original_data.copy() # type: DataFrame
# Check if the Arbitration ID always used the same DLC. If not, ignore it.
# We can effectively ignore this Arb ID by not adding it to the Arb ID dictionary.
                    if this_id.original_data['dlc'].nunique() != 1:
continue
this_id.dlc = this_id.original_data['dlc'].iloc[0]
this_id.original_data.drop('dlc', axis=1, inplace=True)
this_id.original_data.drop('id', axis=1, inplace=True)
# If DLC < 8, we can automatically drop data column vectors > DLC.
# E.G. drop bytes "B7" and "B6" if DLC is 6; those are padding data injected by can-dump and were
# not actually on the bus.
if this_id.dlc < 8:
for i in range(this_id.dlc, 8):
this_id.original_data.drop('b' + str(i), axis=1, inplace=True)
# Check if there are duplicate index values and correct them.
if not this_id.original_data.index.is_unique:
correction_mask = this_id.original_data.index.duplicated()
this_id.original_data = this_id.original_data[~correction_mask]
this_id.generate_binary_matrix_and_tang(a_timer, normalize_strategy)
this_id.analyze_transmission_frequency(time_convert=time_conversion,
ci_accuracy=freq_analysis_accuracy,
synchronous_threshold=freq_synchronous_threshold)
id_dictionary[arb_id] = this_id
a_timer.set_arb_id_creation()
a_timer.set_raw_df_to_arb_id_dict()
return id_dictionary, j1979_dictionary

View File

@ -0,0 +1,305 @@
from PreProcessor import PreProcessor
from Validator import Validator
from LexicalAnalysis import tokenize_dictionary, generate_signals
from SemanticAnalysis import generate_correlation_matrix, signal_clustering, j1979_signal_labeling
from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster, plot_dendrogram
from sklearn.preprocessing import minmax_scale
from typing import Callable
from PipelineTimer import PipelineTimer
from os import chdir, mkdir, path, remove
from pickle import dump, load
from numpy import ndarray, zeros, float16
from pandas import DataFrame
# File names for the on-disc data input and output.
output_folder: str = 'output'
pickle_arb_id_filename: str = 'pickleArbIDs.p'
pickle_threshold_filename: str = 'pickleLexThresholds.p'
pickle_j1979_filename: str = 'pickleJ1979.p'
pickle_signal_filename: str = 'pickleSignals.p'
pickle_subset_filename: str = 'pickleSubset.p'
csv_corr_matrix_filename: str = 'subset_correlation_matrix.csv'
pickle_j1979_corr_matrix: str = 'pickleJ1979_correlation.p'
pickle_clusters_filename: str = 'pickleClusters.p'
pickle_linkage_filename: str = 'pickleLinkage.p'
pickle_combined_df_filename: str = 'pickleCombinedDataFrame.p'
csv_all_signals_filename: str = 'complete_correlation_matrix.csv'
pickle_timer_filename: str = 'pickleTimer.p'
dump_to_pickle: bool = True
# Change out the normalization strategies as needed.
tang_normalize_strategy: Callable = minmax_scale
signal_normalize_strategy: Callable = minmax_scale
# Turn on or off portions of the pipeline and output methods using these flags.
force_pre_processing: bool = False
force_threshold_search: bool = False
force_threshold_plotting: bool = False
force_j1979_plotting: bool = True
use_j1979: bool = True
force_lexical_analysis: bool = False
force_signal_generation: bool = False
force_arb_id_plotting: bool = True
force_correlation_matrix: bool = False
force_clustering: bool = False
force_signal_labeling: bool = False
use_j1979_tags_in_plots: bool = True
force_cluster_plotting: bool = True
force_dendrogram_plotting: bool = True
# Parameters and threshold used for Arb ID transmission frequency analysis during Pre-processing.
time_conversion = 1000 # convert seconds to milliseconds
z_lookup = {.8: 1.28, .9: 1.645, .95: 1.96, .98: 2.33, .99: 2.58}
freq_analysis_accuracy = z_lookup[0.9]
freq_synchronous_threshold = 0.1
# Threshold parameters used during lexical analysis.
tokenization_bit_distance: float = 0.2
tokenize_padding: bool = True
merge_tokens: bool = True
# Threshold parameters used during semantic analysis
subset_selection_size: float = 0.25
max_intra_cluster_distance: float = 0.20
min_j1979_correlation: float = 0.85
# fuzzy_labeling: bool = True
# A timer class to record timings throughout the pipeline.
a_timer = PipelineTimer(verbose=True)
class Sample:
def __init__(self, make: str, model: str, year: str, sample_index: str, sample_path: str, kfold_n: int):
# Sample Specific Meta-Data
self.make: str = make
self.model: str = model
self.year: str = year
self.path: str = sample_path
self.output_vehicle_dir: str = make + "_" + model + "_" + year
self.output_sample_dir: str = sample_index
# Pre-Processing Settings
self.use_j1979: bool = use_j1979
self.force_threshold_plot: bool = force_threshold_plotting
self.avg_score_matrix: ndarray = zeros((1, 1), dtype=float16)
# Lexical analysis settings
self.tang_inversion_bit_dist: float = tokenization_bit_distance
self.use_padding: bool = tokenize_padding
self.merge_tokens: bool = merge_tokens
self.token_merge_dist: float = tokenization_bit_distance
# Semantic analysis settings
self.max_inter_cluster_dist: float = max_intra_cluster_distance
# Various comparison testing methods are implemented in the Validator class
self.validator: Validator = Validator(use_j1979, kfold_n)
def make_and_move_to_vehicle_directory(self):
# This drills down three directories to './output/make_model_year/sample_index/' Make directories as needed
if not path.exists(output_folder):
mkdir(output_folder)
chdir(output_folder)
if not path.exists(self.output_vehicle_dir):
mkdir(self.output_vehicle_dir)
chdir(self.output_vehicle_dir)
if not path.exists(self.output_sample_dir):
mkdir(self.output_sample_dir)
chdir(self.output_sample_dir)
@staticmethod
def move_back_to_parent_directory():
# Move back to root of './output/make_model_year/sample_index/"
chdir("../../../")
def pre_process(self):
self.make_and_move_to_vehicle_directory()
pre_processor = PreProcessor(self.path, pickle_arb_id_filename, pickle_j1979_filename, self.use_j1979)
id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
tang_normalize_strategy,
time_conversion,
freq_analysis_accuracy,
freq_synchronous_threshold,
force_pre_processing)
if dump_to_pickle:
if force_pre_processing:
if path.isfile(pickle_arb_id_filename):
remove(pickle_arb_id_filename)
if path.isfile(pickle_j1979_filename):
remove(pickle_j1979_filename)
# Lexical analysis will add additional information to the Arb ID dict. Don't dump if you're going to
# immediately delete and replace pickle_arb_id_filename during Lexical Analysis.
if not force_lexical_analysis:
if not path.isfile(pickle_arb_id_filename) and id_dictionary:
print("\nDumping arb ID dictionary for " + self.output_vehicle_dir + " to " +
pickle_arb_id_filename)
dump(id_dictionary, open(pickle_arb_id_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_j1979_filename) and j1979_dictionary:
print("\nDumping J1979 dictionary for " + self.output_vehicle_dir + " to " + pickle_j1979_filename)
dump(j1979_dictionary, open(pickle_j1979_filename, "wb"))
print("\tComplete...")
self.move_back_to_parent_directory()
return id_dictionary, j1979_dictionary
def plot_j1979(self, j1979_dictionary: dict, vehicle_number: str):
self.make_and_move_to_vehicle_directory()
plot_j1979(a_timer, j1979_dictionary, vehicle_number, force_j1979_plotting)
self.move_back_to_parent_directory()
def find_lex_thresholds(self, id_dict: dict):
self.make_and_move_to_vehicle_directory()
if path.isfile(pickle_threshold_filename):
if force_threshold_search:
# Remove any existing pickled threshold parameter search result matrix and create one.
remove(pickle_threshold_filename)
else:
print("\n\tLex threshold search already completed and forcing is turned off. Using pickled data...")
self.avg_score_matrix = load(open(pickle_threshold_filename, "rb"))
self.validator.set_lex_threshold_parameters(self)
# Move back to root of './output/make_model_year/sample_index/"
self.move_back_to_parent_directory()
return
self.validator.k_fold_lex_threshold_selection(id_dict=id_dict, sample=self)
if not path.isfile(pickle_threshold_filename):
print("\nDumping lexical analysis threshold search matrix for " + self.output_vehicle_dir +
" to " + pickle_threshold_filename)
dump(self.avg_score_matrix, open(pickle_threshold_filename, "wb"))
print("\tComplete...")
self.move_back_to_parent_directory()
def tokenize_dictionary(self, id_dictionary: dict):
# Using force_pre_processing = True and force_lexical_analysis = False will cause the 2nd condition to trigger.
if force_lexical_analysis or not path.isfile(pickle_arb_id_filename):
tokenize_dictionary(a_timer=a_timer, d=id_dictionary, force=force_lexical_analysis,
include_padding=self.use_padding, merge=self.merge_tokens,
max_distance=self.tang_inversion_bit_dist)
if dump_to_pickle:
self.make_and_move_to_vehicle_directory()
if force_lexical_analysis:
if path.isfile(pickle_arb_id_filename):
remove(pickle_arb_id_filename)
if not path.isfile(pickle_arb_id_filename) and id_dictionary:
print("\nDumping arb ID dictionary for " + self.output_vehicle_dir + " to " + pickle_arb_id_filename)
dump(id_dictionary, open(pickle_arb_id_filename, "wb"))
print("\tComplete...")
self.move_back_to_parent_directory()
def generate_signals(self, id_dictionary: dict, postpone_pickle: bool = False):
self.make_and_move_to_vehicle_directory()
signal_dict = generate_signals(a_timer=a_timer,
arb_id_dict=id_dictionary,
signal_pickle_filename=pickle_signal_filename,
normalize_strategy=signal_normalize_strategy,
force=force_signal_generation)
# postpone_pickle is simply a check whether J1979 data was present in the sample. If it was present, then wait
# to save out the signal_dictionary until correlated Signals are labeled by sample.j1979_labeling().
if dump_to_pickle and not postpone_pickle and not path.isfile(pickle_signal_filename):
print("\nDumping signal dictionary for " + self.output_vehicle_dir + " to " + pickle_signal_filename)
dump(signal_dict, open(pickle_signal_filename, "wb"))
print("\tComplete...")
self.move_back_to_parent_directory()
return signal_dict
def plot_arb_ids(self, id_dictionary: dict, signal_dictionary: dict, vehicle_number: str):
self.make_and_move_to_vehicle_directory()
plot_signals_by_arb_id(a_timer=a_timer,
arb_id_dict=id_dictionary,
signal_dict=signal_dictionary,
vehicle_number=vehicle_number,
force=force_arb_id_plotting)
self.move_back_to_parent_directory()
def generate_correlation_matrix(self, signal_dictionary: dict):
self.make_and_move_to_vehicle_directory()
if dump_to_pickle and force_correlation_matrix:
if path.isfile(csv_corr_matrix_filename):
remove(csv_corr_matrix_filename)
corr_matrix, combined_df = generate_correlation_matrix(a_timer=a_timer,
csv_signals_correlation_filename=csv_corr_matrix_filename,
combined_df_filename=pickle_combined_df_filename,
signal_dict=signal_dictionary,
force=force_correlation_matrix)
if not path.isfile(csv_corr_matrix_filename) and not corr_matrix.empty:
print("\nDumping subset correlation matrix for " + self.output_vehicle_dir + " to " +
csv_corr_matrix_filename)
corr_matrix.to_csv(csv_corr_matrix_filename)
print("\tComplete...")
if not path.isfile(pickle_combined_df_filename) and not combined_df.empty:
print("\nDumping combined signal DataFrame matrix for " + self.output_vehicle_dir + " to " +
pickle_combined_df_filename)
dump(combined_df, open(pickle_combined_df_filename, "wb"))
print("\tComplete...")
self.move_back_to_parent_directory()
return corr_matrix, combined_df
def cluster_signals(self, corr_matrix: DataFrame):
self.make_and_move_to_vehicle_directory()
cluster_dict, linkage_matrix = signal_clustering(corr_matrix,
self.max_inter_cluster_dist,
pickle_clusters_filename,
pickle_linkage_filename,
force_clustering) # type: dict, ndarray
# Before we return or save the clusters, lets remove all singleton clusters. This serves as an implicit
# filtering technique for incorrectly tokenized signals.
list_to_remove = []
for k, cluster in cluster_dict.items():
if len(cluster) < 2:
list_to_remove.append(k)
for k in list_to_remove:
cluster_dict.pop(k, None)
if not path.isfile(pickle_clusters_filename) and cluster_dict:
print("\nDumping cluster dictionary to " + pickle_clusters_filename)
dump(cluster_dict, open(pickle_clusters_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_linkage_filename):
print("\nDumping agglomerative clustering linkage matrix to " + pickle_linkage_filename)
dump(linkage_matrix, open(pickle_linkage_filename, "wb"))
print("\tComplete...")
self.move_back_to_parent_directory()
return cluster_dict, linkage_matrix
def j1979_labeling(self, j1979_dictionary: dict, signal_dictionary: dict, combined_df: DataFrame):
self.make_and_move_to_vehicle_directory()
signal_dictionary, j1979_correlation_matrix = j1979_signal_labeling(a_timer=a_timer,
j1979_corr_filename=pickle_j1979_corr_matrix,
signal_filename=pickle_signal_filename,
df_signals=combined_df,
j1979_dict=j1979_dictionary,
signal_dict=signal_dictionary,
correlation_threshold=min_j1979_correlation,
force=force_signal_generation)
# If the signal dictionary pickled data was deleted because j1979 tagging needed to happen, then save it with
# the tags added by the call to j1979_signal_labeling().
if not path.isfile(pickle_signal_filename) and signal_dictionary and j1979_dictionary:
print("\nDumping J1979 tagged signal dictionary to " + pickle_signal_filename)
dump(signal_dictionary, open(pickle_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_j1979_corr_matrix):
print("\nDumping j1979 signal correlation matrix to " + pickle_j1979_corr_matrix)
dump(j1979_correlation_matrix, open(pickle_j1979_corr_matrix, "wb"))
print("\tComplete...")
self.move_back_to_parent_directory()
return signal_dictionary, j1979_correlation_matrix
def plot_clusters(self, cluster_dictionary: dict, signal_dictionary: dict, use_j1979_tags: bool,
vehicle_number: str):
self.make_and_move_to_vehicle_directory()
plot_signals_by_cluster(a_timer=a_timer,
cluster_dict=cluster_dictionary,
signal_dict=signal_dictionary,
use_j1979_tags=use_j1979_tags,
vehicle_number=vehicle_number,
force=force_cluster_plotting)
self.move_back_to_parent_directory()
def plot_dendrogram(self, linkage_matrix: ndarray, vehicle_number: str):
self.make_and_move_to_vehicle_directory()
plot_dendrogram(a_timer=a_timer, linkage_matrix=linkage_matrix, threshold=self.max_inter_cluster_dist,
vehicle_number=vehicle_number, force=force_dendrogram_plotting)
self.move_back_to_parent_directory()

View File

@ -0,0 +1,427 @@
from pandas import concat, DataFrame, read_csv
from numpy import ndarray, zeros
from os import path, remove
from pickle import load, dump
from ast import literal_eval
from J1979 import J1979
from Signal import Signal
from PipelineTimer import PipelineTimer
import scipy.spatial.distance as ssd
from scipy.cluster.hierarchy import linkage, fcluster
def generate_correlation_matrix(a_timer: PipelineTimer,
csv_signals_correlation_filename: str = '',
combined_df_filename: str = '',
signal_dict: dict = None,
force: bool = False):
if force:
if path.isfile(csv_signals_correlation_filename):
remove(csv_signals_correlation_filename)
if path.isfile(combined_df_filename):
remove(combined_df_filename)
if path.isfile(csv_signals_correlation_filename) and path.isfile(combined_df_filename) and not force:
print("\nA signal correlation matrix and combined matrix appears to exist and forcing is turned off. Using " +
csv_signals_correlation_filename + " and " + combined_df_filename)
# literal_eval converts the textual row/col tuple representation back to actual tuple data structures
return [read_csv(csv_signals_correlation_filename, index_col=0).rename(index=literal_eval, columns=literal_eval),
load(open(combined_df_filename, "rb"))]
non_static_signals_dict = {}
largest_index = []
df_columns = []
# Put all non-static signals into one DataFrame. Re-index all of them to share the same index.
for k_arb_id, arb_id_signals in signal_dict.items():
for k_signal_id, signal in arb_id_signals.items():
if not signal.static:
non_static_signals_dict[k_signal_id] = signal
df_columns.append(k_signal_id)
if signal.time_series.__len__() > largest_index.__len__():
largest_index = signal.time_series.index
df: DataFrame = DataFrame(zeros((largest_index.__len__(), df_columns.__len__())),
columns=df_columns,
index=largest_index)
for k_signal_id, signal in non_static_signals_dict.items():
df[k_signal_id] = signal.time_series.reindex(index=largest_index, method='nearest')
# Calculate the correlation matrix for this DataFrame of all non-static signals.
corr_matrix = df.corr()
# For some reason, despite no NaN and correct data types, the corr() method will return empty row/col for some
# signals. We're going to have to clean this up before clustering.
corr_matrix.dropna(axis=0, how='all', inplace=True)
corr_matrix.dropna(axis=1, how='all', inplace=True)
# Be sure to reflect any signal IDs dropped from the correlation matrix (due to empty row/col) in the combined DF
# as well.
return corr_matrix, df.loc[:, corr_matrix.columns.values]
def signal_clustering(corr_matrix: DataFrame,
threshold: float,
cluster_pickle: str = "",
linkage_pickle: str = "",
force: bool = False):
if force:
if path.isfile(cluster_pickle):
remove(cluster_pickle)
if path.isfile(linkage_pickle):
remove(linkage_pickle)
if path.isfile(cluster_pickle) and path.isfile(linkage_pickle):
print("\nSignal clustering already completed and forcing is turned off. Using pickled data...")
return [load(open(cluster_pickle, "rb")), load(open(linkage_pickle, "rb"))]
    # Clamp negative correlations to zero and convert correlations to distances (1 - correlation)
corr_matrix.where(corr_matrix > 0, 0, inplace=True)
corr_matrix = 1 - corr_matrix
X = corr_matrix.values # type: ndarray
Y = ssd.squareform(X)
# Z is the linkage matrix. This can serve as input to the scipy.cluster.hierarchy.dendrogram method
Z = linkage(Y, method='single', optimal_ordering=True)
fclus = fcluster(Z, t=threshold, criterion='distance')
cluster_dict = {}
for i, cluster_label in enumerate(fclus):
if cluster_label in cluster_dict:
cluster_dict[cluster_label].append(corr_matrix.index[i])
else:
cluster_dict[cluster_label] = [corr_matrix.index[i]]
return cluster_dict, Z
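# Worked toy example for signal_clustering above (hypothetical numbers): a 0.95 correlation
# between two signals becomes a distance of 1 - 0.95 = 0.05, so with the 0.20 threshold
# configured in Sample.py they merge into one cluster, while a third signal correlating with
# them at only 0.2 (distance 0.8) is cut away by fcluster into a singleton cluster that
# Sample.cluster_signals later drops.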
def subset_selection(a_timer: PipelineTimer,
signal_dict: dict = None,
subset_pickle: str = "",
force: bool = False,
subset_size: float = 0.25) -> DataFrame:
if path.isfile(subset_pickle):
if force:
# Remove any existing pickled Signal dictionary and create one.
remove(subset_pickle)
else:
print("\nSubset selection already completed and forcing is turned off. Using pickled data...")
return load(open(subset_pickle, "rb"))
a_timer.start_function_time()
signal_index = 0
for k_arb_id, arb_id_signals in signal_dict.items():
for k_signal_id, signal in arb_id_signals.items():
if not signal.static:
signal_index += 1
# setup subset selection data structure
df: DataFrame = DataFrame(zeros((signal_index, 4)),
columns=["arb_id", "start_index", "stop_index", "Shannon_Index"])
for i, (k_arb_id, arb_id_signals) in enumerate(signal_dict.items()):
for j, (k_signal_id, signal) in enumerate(arb_id_signals.items()):
if not signal.static:
df.iloc[signal_index-1] = [k_arb_id, signal.start_index, signal.stop_index, signal.shannon_index]
signal_index -= 1
# sort by Shannon Index
df.sort_values(by="Shannon_Index", inplace=True, ascending=False)
# Select subset with largest Shannon Index Values
df = df.head(int(round(df.__len__() * subset_size, 0)))
# In order to make an arb ID sorted output, sort this subset by arb_id
df.sort_values(by="arb_id", inplace=True)
# Re-index each Signal in the subset using the Signal with the most observed samples. Prepare to create a DataFrame
# that can be used for generating a correlation matrix.
subset = []
subset_cols = []
largest_index = []
for index, row in df.iterrows():
signal_id = (int(row[0]), int(row[1]), int(row[2]))
signal = signal_dict[row[0]][signal_id]
subset.append(signal)
subset_cols.append(signal_id)
if signal.time_series.__len__() > largest_index.__len__():
largest_index = signal.time_series.index
subset_df: DataFrame = DataFrame(zeros((largest_index.__len__(), subset.__len__())),
columns=subset_cols,
index=largest_index)
for signal in subset:
signal_id = (signal.arb_id, signal.start_index, signal.stop_index)
subset_df[signal_id] = signal.time_series.reindex(index=largest_index, method='nearest')
a_timer.set_subset_selection()
return subset_df
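# Minimal usage sketch (illustrative names): subset_df = subset_selection(timer, signal_dict=signals,
# subset_pickle='subset.p', subset_size=0.25)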
def subset_correlation(subset: DataFrame,
csv_correlation_filename: str,
force: bool = False) -> DataFrame:
if not force and path.isfile(csv_correlation_filename):
print("\nA subset correlation appears to exist and forcing is turned off. Using " + csv_correlation_filename)
# Read the .csv into a DataFrame. Also, we need to convert the columns and index from strings back to tuples.
# Pandas.read_csv brings the data in as a DataFrame. Pandas.DataFrame.rename converts the columns and index with
# ast.literal_eval. Literal_eval will convert a string representation of a tuple back to an actual tuple.
return read_csv(csv_correlation_filename, index_col=0).rename(index=literal_eval, columns=literal_eval)
else:
return subset.corr()
def greedy_signal_clustering(correlation_matrix: DataFrame = None,
correlation_threshold: float = 0.8,
fuzzy_labeling: bool = True) -> dict:
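# Greedy clustering: walk every off-diagonal pair of the correlation matrix and, whenever the (rounded)
# correlation meets the threshold, give the two signals a shared cluster label, creating new labels as
# needed. With fuzzy_labeling a signal is allowed to carry more than one cluster label.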
correlation_keys = correlation_matrix.columns.values
previously_clustered_signals = {}
cluster_dict = {}
new_cluster_label = 0
for n, row in enumerate(correlation_keys):
for m, col in enumerate(correlation_keys):
if n == m:
# this is a diagonal on the correlation matrix. Skip it.
continue
# I chose to round here to allow relationships that are 'oh so close' to make it. No reason this HAS to be done.
result = round(correlation_matrix.iloc[n, m], 2)
# check if this is a significant correlation according to our heuristic threshold.
if result >= correlation_threshold:
# Check if the current row signal is currently unlabeled
if row not in previously_clustered_signals.keys():
# Check if the current col signal is currently unlabeled
if col not in previously_clustered_signals.keys():
# Both signals are unlabeled. Create a new one.
cluster_dict[new_cluster_label] = [row, col]
previously_clustered_signals[row] = {new_cluster_label}
previously_clustered_signals[col] = {new_cluster_label}
# print("created new cluster #", new_cluster_label, cluster_dict[new_cluster_label])
new_cluster_label += 1
else:
# Row isn't labeled but col is; add row to all of col's clusters.
# print("adding", row, "to clusters", previously_clustered_signals[col])
# row is not already in a cluster, add it to col's set of clusters
for label in previously_clustered_signals[col]:
cluster_dict[label].append(row)
previously_clustered_signals[row] = previously_clustered_signals[col]
else:
# Check if the current col signal is currently unlabeled
if col not in previously_clustered_signals.keys():
# Row is labeled but col is not; add col to row's set of clusters
# print("adding", col, "to clusters", previously_clustered_signals[row])
for label in previously_clustered_signals[row]:
cluster_dict[label].append(col)
previously_clustered_signals[col] = previously_clustered_signals[row]
# Both signals are already labeled
else:
# Check if we're using fuzzy labeling (a signal can belong to multiple clusters).
# If so, check whether the intersection of the two label sets is empty. If it is, this is a
# relationship that hasn't already been captured by an existing cluster. Make a new one.
if fuzzy_labeling:
row_label_set = previously_clustered_signals[row]
col_label_set = previously_clustered_signals[col]
if not row_label_set & col_label_set:
cluster_dict[new_cluster_label] = [row, col]
previously_clustered_signals[row] = {new_cluster_label} | row_label_set
previously_clustered_signals[col] = {new_cluster_label} | col_label_set
# print("created new cluster #", new_cluster_label, cluster_dict[new_cluster_label])
new_cluster_label += 1
else:
# We're using fuzzy labeling and these two signals represent a 'bridge' between two signal
# clusters. Merge the label sets: add col to row's unique clusters, add row to col's unique
# clusters, and give both signals the combined set of labels.
for label in row_label_set - col_label_set:
cluster_dict[label].append(col)
previously_clustered_signals[col] = row_label_set | col_label_set
for label in col_label_set - row_label_set:
cluster_dict[label].append(row)
previously_clustered_signals[row] = row_label_set | col_label_set
# print(row, col, "already in cluster_dict", previously_clustered_signals[row], "&",
# previously_clustered_signals[col])
# Delete any duplicate clusters
cluster_sets = []
deletion_labels = []
for label, cluster in cluster_dict.items():
this_set = set(cluster)
if this_set in cluster_sets:
deletion_labels.append(label)
else:
cluster_sets.append(this_set)
for label in deletion_labels:
del cluster_dict[label]
return cluster_dict
# NOTE: This method has a LOT of redundancy with the subset_selection, signal_correlation, and greedy_signal_clustering
# logic. If you know you always want to perform label propagation, it would be more efficient to incorporate it directly
# into those functions. Since this code base is more of a Proof of Concept, label propagation is deliberately pulled
# out as a distinct method to make the pipeline steps as distinct as possible.
def label_propagation(a_timer: PipelineTimer,
pickle_clusters_filename: str = '',
pickle_all_signals_df_filename: str = '',
csv_signals_correlation_filename: str = '',
signal_dict: dict = None,
cluster_dict: dict = None,
correlation_threshold: float = 0.8,
force: bool = False):
if path.isfile(pickle_all_signals_df_filename) and path.isfile(csv_signals_correlation_filename):
if force:
# Remove any existing data.
remove(pickle_all_signals_df_filename)
remove(csv_signals_correlation_filename)
remove(pickle_clusters_filename)
else:
print("\nA DataFrame and correlation matrix for label propagation appears to exist and forcing is turned "
"off. Using " + pickle_all_signals_df_filename + ", " + csv_signals_correlation_filename + ", and "
+ pickle_clusters_filename)
return [load(open(pickle_all_signals_df_filename, "rb")),
read_csv(csv_signals_correlation_filename, index_col=0).rename(index=literal_eval,
columns=literal_eval),
load(open(pickle_clusters_filename, "rb"))]
a_timer.start_function_time()
non_static_signals_dict = {}
largest_index = []
df_columns = []
# Put all non-static signals into one DataFrame. Re-index all of them to share the same index.
for k_arb_id, arb_id_signals in signal_dict.items():
for k_signal_id, signal in arb_id_signals.items():
if not signal.static:
non_static_signals_dict[k_signal_id] = signal
df_columns.append(k_signal_id)
if signal.time_series.__len__() > largest_index.__len__():
largest_index = signal.time_series.index
df: DataFrame = DataFrame(zeros((largest_index.__len__(), df_columns.__len__())),
columns=df_columns,
index=largest_index)
for k_signal_id, signal in non_static_signals_dict.items():
df[k_signal_id] = signal.time_series.reindex(index=largest_index, method='nearest')
# Calculate the correlation matrix for this DataFrame of all non-static signals.
correlation_matrix = df.corr()
# Re-run the algorithm from greedy_signal_clustering but omitting the logic for creating new clusters.
# This effectively propagates the labels generated by the subset of signals with the largest Shannon Index values
# to any correlated signals which were not part of that subset.
correlation_keys = correlation_matrix.columns.values
previously_clustered_signals = {}
for k_cluster_id, cluster in cluster_dict.items():
for k_signal_id in cluster:
previously_clustered_signals[k_signal_id] = k_cluster_id
for n, row in enumerate(correlation_keys):
for m, col in enumerate(correlation_keys):
if n == m:
# this is a diagonal on the correlation matrix. Skip it.
continue
# I chose to round here to allow relationships that are 'oh so close' to make it. No reason this HAS to be done.
result = round(correlation_matrix.iloc[n, m], 2)
# check if this is a significant correlation according to our heuristic threshold.
if result >= correlation_threshold:
# if row signal is already a member of a cluster
if row in previously_clustered_signals.keys():
# if col signal is already a member of a cluster
if col in previously_clustered_signals.keys():
# print(row, col, "already in clusters", previously_clustered_signals[row], "&",
# previously_clustered_signals[col])
continue
# if col is not already in a cluster, add it to row's cluster
else:
# print("adding", col, "to cluster", clusters[previously_clustered_signals[row]])
cluster_dict[previously_clustered_signals[row]].append(col)
previously_clustered_signals[col] = previously_clustered_signals[row]
# row signal hasn't been added to a cluster
else:
# if col signal is already a member of a cluster
if col in previously_clustered_signals.keys():
# print("adding", row, "to cluster", clusters[previously_clustered_signals[col]])
# row is not already in a cluster, add it to col's cluster
cluster_dict[previously_clustered_signals[col]].append(row)
previously_clustered_signals[row] = previously_clustered_signals[col]
a_timer.set_label_propagation()
df.dropna(axis=0, how='any', inplace=True)
df.dropna(axis=1, how='any', inplace=True)
return df, correlation_matrix, cluster_dict
def j1979_signal_labeling(a_timer: PipelineTimer,
j1979_corr_filename: str = "",
signal_filename: str = "",
df_signals: DataFrame = None,
j1979_dict: dict = None,
signal_dict: dict = None,
correlation_threshold: float = 0.8,
force: bool = False) -> [dict, DataFrame]:
if force:
if path.isfile(j1979_corr_filename):
remove(j1979_corr_filename)
if path.isfile(signal_filename):
remove(signal_filename)
if path.isfile(j1979_corr_filename):
print("\nA J1979 correlation matrix for signal labeling appears to exist and forcing is turned off. Using "
+ j1979_corr_filename + " and the signal dictionary passed to j1979_signal_labeling().")
return signal_dict, load(open(j1979_corr_filename, "rb"))
# If a signal dictionary has already been pickled, j1979 correlation has not been pickled, and there is J1979 data
# in this sample, this implies that the pickled signal dictionary does not include the appropriate J1979 tags on the
# Signal objects. Delete that pickled dictionary, tag Signals in signal_dict, and then re-pickle that dictionary in
# Sample.py.
elif path.isfile(signal_filename) and j1979_dict:
if path.isfile(signal_filename):
remove(signal_filename)
latest_start_index = 0.0
earliest_end_index = 99999999999999.9
df_columns = []
for pid, pid_data in j1979_dict.items(): # type: int, J1979
if latest_start_index < pid_data.data.index[0]:
latest_start_index = pid_data.data.index[0]
if earliest_end_index > pid_data.data.index[-1]:
earliest_end_index = pid_data.data.index[-1]
df_columns.append(pid_data.title)
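# Trim the signal DataFrame to the time window covered by every J1979 PID so the PIDs can be reindexed
# onto its index without extrapolating.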
df_signals = df_signals.loc[latest_start_index:earliest_end_index] # type: DataFrame
df_j1979: DataFrame = DataFrame(zeros((df_signals.shape[0], df_columns.__len__())),
columns=df_columns,
index=df_signals.index)
for pid, pid_data in j1979_dict.items(): # type: int, J1979
df_j1979[pid_data.title] = pid_data.data.reindex(index=df_signals.index, method='nearest')
df_combined = concat([df_signals, df_j1979], axis=1)
correlation_matrix = df_combined.corr()
correlation_matrix.dropna(axis=1, how='all', inplace=True)
correlation_matrix.dropna(axis=0, how='all', inplace=True)
# Just consider the J1979 column correlations. Slice off the identity rows at the end of the DataFrame as well.
for index, row in correlation_matrix[df_columns][:-len(df_columns)].iterrows():
row = abs(row)
max_index = row.idxmax(skipna=True)
if row[max_index] >= correlation_threshold:
# Index is the tuple identifying the signal (Arb ID, Start Index, Stop Index); signal_dict keyed on Arb ID
signal = signal_dict[index[0]][index] # type: Signal
# print("Adding J1979 attribute " + max_index + " to signal ID " + str(index))
signal.j1979_title = max_index
signal.j1979_pcc = row[max_index]
signal_dict[index[0]][index] = signal
# print(i, index, row[max_index], max_index, row.values)
return signal_dict, correlation_matrix[df_columns][:-len(df_columns)]
# correlation_matrix.to_csv('j1979_correlation.csv')

View File

@@ -0,0 +1,43 @@
from pandas import Series
from math import log10
class Signal:
def __init__(self, arb_id: int, start_index: int, stop_index: int):
self.arb_id: int = arb_id
self.start_index: int = start_index
self.stop_index: int = stop_index
self.time_series: Series = None
self.static: bool = True
self.shannon_index: float = 0
self.plot_title: str = ""
self.j1979_title: str = None
self.j1979_pcc: float = 0
def normalize_and_set_metadata(self, normalize_strategy):
self.set_shannon_index()
self.update_static()
self.set_plot_title()
self.normalize(normalize_strategy)
def set_shannon_index(self):
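# Shannon index H = -sum(p_i * log10(p_i)) over the distinct values in the time series; a signal that
# never changes has H = 0 and is treated as static.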
si: float = 0.0
n: int = self.time_series.__len__()
for count in self.time_series.value_counts():
# calculate proportion of this integer value in the total population of values
p_i = count / n
# calculate the Shannon Index (si) given p_i of this value.
si += p_i * log10(p_i)
si *= -1
self.shannon_index = si
def update_static(self):
if self.shannon_index >= .000001:
self.static = False
def set_plot_title(self):
self.plot_title = "Time Series from Bit Positions " + str(self.start_index) + " to " + str(self.stop_index) + \
" of Arb ID " + hex(int(self.arb_id))
def normalize(self, normalize_strategy):
normalize_strategy(self.time_series.values, copy=False)

View File

@@ -0,0 +1,105 @@
from LexicalAnalysis import get_composition_just_tang, merge_tokens_just_composition
from sklearn.model_selection import KFold
from ArbID import ArbID
from numpy import arange, ndarray, zeros, float16, add, divide, argmax, unravel_index
# Threshold parameters used during tokenization.
tokenization_bit_distance: float = 0.2
tokenize_padding: bool = True
def alignment_score(mismatch: int, bit_width: int):
# Alignment Score: 1 - (union-intersection) / (n-1)
# The range of Alignment Score is 0 (all mismatch) to 1 (all match)
return float16(1 - mismatch / (bit_width - 1))
def borders(token: tuple, last_index: int):
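# Return the bit positions marking this token's boundaries (the bit just before the token and the token's
# last bit), skipping boundaries that coincide with the start or end of the payload.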
if not token[0] == 0:
if not token[1] == last_index:
return [token[0]-1, token[1]]
else:
return [token[0]-1]
else:
if not token[1] == last_index:
return [token[1]]
return []
def train_test_alignment_score(tang_a: ndarray, tang_b: ndarray, max_inversion: float, max_merge: float):
# Note: padding is used as a dummy variable to maintain compatibility with the primary pipeline get_composition()
comp_a, padding = get_composition_just_tang(tang_a, include_padding=True, max_inversion_distance=max_inversion)
comp_a = merge_tokens_just_composition(tokens=comp_a, this_tang=tang_a, max_distance=max_merge)
comp_b, padding = get_composition_just_tang(tang_b, include_padding=True, max_inversion_distance=max_inversion)
comp_b = merge_tokens_just_composition(tokens=comp_b, this_tang=tang_b, max_distance=max_merge)
bit_width = len(tang_a)
id_a_borders = []
id_b_borders = []
for token in comp_a:
# print("\nToken:", token)
# result = borders(token, bit_width-1)
# print("\tBorders:", result)
id_a_borders.extend(borders(token, bit_width-1))
for token in comp_b:
# print("\nToken:", token)
# result = borders(token, bit_width - 1)
# print("\tBorders:", result)
id_b_borders.extend(borders(token, bit_width-1))
id_a_borders = set(id_a_borders)
id_b_borders = set(id_b_borders)
# Mismatch set is the symmetric difference: borders where there was a border in only one of the two payloads.
mismatch_set = id_a_borders.union(id_b_borders) - id_a_borders.intersection(id_b_borders)
return alignment_score(len(mismatch_set), bit_width)
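# A score of 1.0 means the train and test TANGs tokenize with identical boundaries; each mismatched
# boundary lowers the score by 1 / (bit_width - 1).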
class Validator:
def __init__(self,
use_j1979: bool = False,
fold_n: int = 5):
self.use_j1979: bool = use_j1979
self.fold_n: int = fold_n
@staticmethod
# This function allows for pickling just the score matrix and re-creating a Sample object using it later.
def set_lex_threshold_parameters(sample):
if sample.avg_score_matrix.shape[0] > 1:
optimal_setting = argmax(sample.avg_score_matrix)
optimal_setting = unravel_index(optimal_setting, sample.avg_score_matrix.shape)
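# unravel_index converts the flat argmax into (row, col) grid indices. Note these are indices into the
# 0.00-1.00 (step 0.01) threshold grids built in k_fold_lex_threshold_selection, not the threshold values.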
sample.optimal_bit_dist = optimal_setting[0]
sample.optimal_merge_dist = optimal_setting[1]
else:
print("\nSet lex threshold parameters was improperly called for sample " + sample.output_vehicle_dir)
def k_fold_lex_threshold_selection(self, id_dict: dict, sample):
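# Grid-search the tokenization thresholds: every (inversion distance, merge distance) pair in [0, 1] at
# 0.01 resolution is scored with k-fold cross validation over each qualifying Arb ID's payload matrix
# (static and short IDs are skipped).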
list_of_inversion_values = arange(0, 1.01, 0.01)
list_of_merge_values = arange(0, 1.01, 0.01)
sample.avg_score_matrix = zeros((len(list_of_inversion_values), len(list_of_merge_values)), dtype=float16)
number_of_ids_scored: int = 0
for id_label, arb_id in id_dict.items(): # type: int, ArbID
if arb_id.static or arb_id.short:
continue
this_id_avg_score_matrix = zeros((len(list_of_inversion_values), len(list_of_merge_values)), dtype=float16)
print("\tID:", id_label, "\tnumber of observed payloads:", arb_id.original_data.shape[0])
kf = KFold(n_splits=self.fold_n)
for k, (train, test) in enumerate(kf.split(arb_id.boolean_matrix)):
score_matrix = zeros((len(list_of_inversion_values), len(list_of_merge_values)), dtype=float16)
train_tang = arb_id.generate_tang(boolean_matrix=arb_id.boolean_matrix[train])
test_tang = arb_id.generate_tang(boolean_matrix=arb_id.boolean_matrix[test])
for m, i in enumerate(list_of_inversion_values):
for n, j in enumerate(list_of_merge_values):
score_matrix[m, n] = train_test_alignment_score(train_tang, test_tang, i, j)
this_id_avg_score_matrix = add(this_id_avg_score_matrix, score_matrix)
this_id_avg_score_matrix = divide(this_id_avg_score_matrix, self.fold_n)
sample.avg_score_matrix = add(sample.avg_score_matrix, this_id_avg_score_matrix)
number_of_ids_scored += 1
sample.avg_score_matrix = divide(sample.avg_score_matrix, number_of_ids_scored)
self.set_lex_threshold_parameters(sample)

View File

@@ -0,0 +1,81 @@
from numpy import arange, ndarray, zeros, concatenate, uint64
from math import log10
from pandas import Series, concat
def shannon_index(X: Series):
H: float = 0.0
n: int = X.shape[0]
for count in X.value_counts():
# calculate proportion of this integer value in the total population of values
p_i = count / n
# calculate the Shannon Index (H) given p_i of this value.
H += p_i * log10(p_i)
H *= -1
return H
def make_binary_matrix(X: Series):
boolean_matrix = zeros((X.shape[0], 8), dtype=uint64)
for i, row in enumerate(X.iteritems()):
if row[1] > 0:
# i is the row in the boolean_matrix. Each value here is a single byte, so it always expands to
# exactly 8 boolean columns via its '08b' binary string representation.
bin_string = format(row[1], '08b')
boolean_matrix[i, :] = [x == '1' for x in bin_string]
return boolean_matrix
def binary_to_int(X: ndarray, tokens: list):
signals = {}
for token in tokens:
# Convert the binary ndarray to a list of string representations of each row
temp1 = [''.join(str(x) for x in row) for row in X[:, token[0]:token[1] + 1]]
temp2 = zeros((temp1.__len__(), 1), dtype=uint64)
# convert each string representation to int
for i, row in enumerate(temp1):
temp2[i] = int(row, 2)
# create an unsigned integer pandas.Series from the decoded values (this scratch example has no time index)
signal = Series(temp2[:, 0])
signals[token] = signal
return signals
x1 = Series(arange(0, 32, 1))
x1 = concat([x1, x1], axis=0, ignore_index=True)
x1 = concat([x1, x1], axis=0, ignore_index=True)
x1 = concat([x1, x1], axis=0, ignore_index=True)
# x1 = concat([Series(arange(0, 10, 1)), x1], axis=0, ignore_index=True)
# x1 = concat([x1, x1], axis=0, ignore_index=True)
x2 = Series(arange(0, 256, 1))
# print(x1)
# print(shannon_index(x1))
# print(shannon_index(x2))
x1_bin = make_binary_matrix(x1)
x2_bin = make_binary_matrix(x2)
# print(x1_bin)
# print(x2_bin.shape)
x12_bin = concatenate((x1_bin, x2_bin), axis=1)
lower = 0
upper = 15
cutoff = 0
sum_shannon = {}
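# Sweep the cut point across the 16-bit combined payload and record the total Shannon index of the two
# resulting integer signals at each split.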
while cutoff < upper:
d = binary_to_int(x12_bin, [(0, cutoff), (cutoff+1, 15)])
this_sum = 0
for k, v in d.items():
this_sum += shannon_index(v)
sum_shannon[cutoff] = this_sum
cutoff += 1
for k, v in sum_shannon.items():
print(k, v)

14
R/.Rhistory Normal file
View File

@@ -0,0 +1,14 @@
demo()
demo(graphics)
library(rEDM)
devtools::install_github("ha0ye/rEDM")
library(devtools)
library(rEDM)
setwd("C:/Users/brent/Google Drive/AFIT/PhD Research/Validation/R")
speed_brake_rpm <- read.csv("speed_brake_rpm.csv")
ts_speed <- speed_brake_rpm$speed
lib_speed <- c(1, 0.5*length(ts_speed))
pred_speed <- c(0.5*length(ts_speed)+1, length(ts_speed))
simplex_output_speed <- simplex(ts_speed, lib_speed, pred_speed, E = 9, tp = 1:1000)
save(simplex_output_speed, file = "simplex_output_speed_E_9_tp_1-1000.Rda")
plot(simplex_output_speed$tp, simplex_output_speed$rho, type = "l", xlab = "Time to Prediction (tp)", ylab = "Forecast Skill (rho)")

6
R/CCM_speed_brake_rpm.R Normal file
View File

@@ -0,0 +1,6 @@
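# Note: this fragment assumes the surrounding R session already defines `speed_brake_rpm` (read from the
# accompanying csv), `vars` (e.g. vars <- names(speed_brake_rpm)[2:4]), a library size `n`, and an empty
# `ccm_rho_matrix`, e.g. matrix(NA, length(vars), length(vars), dimnames = list(vars, vars)).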
for (ccm_from in vars) {
for (ccm_to in vars[vars != ccm_from]) {
out_temp <- ccm(speed_brake_rpm, E = 9, lib_column = ccm_from, target_column = ccm_to, lib_sizes = n, replace = FALSE, silent = TRUE)
ccm_rho_matrix[ccm_from, ccm_to] <- out_temp$rho
}
}

17842
R/brake_data.csv Normal file

File diff suppressed because it is too large Load Diff

BIN
R/brake_xmap_rpm.Rda Normal file

Binary file not shown.

BIN
R/brake_xmap_rpm_v2.Rda Normal file

Binary file not shown.

BIN
R/brake_xmap_speed.Rda Normal file

Binary file not shown.

BIN
R/brake_xmap_speed_v2.Rda Normal file

Binary file not shown.

167
R/city/.Rhistory Normal file
View File

@@ -0,0 +1,167 @@
demo()
demo(graphics)
library(rEDM)
devtools::install_github("ha0ye/rEDM")
library(devtools)
libary(rEDM)
package(rEDM)
library(rEDM)
setwd("C:/Users/brent/Google Drive/AFIT/PhD Research/Validation/R/city")
speed_brake_rpm <- read.csv("speed_brake_rpm_city.csv")
load(smap_output_brake.RDA)
load(smap_output_brake.Rda)
load(file = "smap_output_brake.Rda")
View(smap_output_brake)
View(speed_brake_rpm)
lib <- c(1, 0.5*length(speed_brake_rpm))
pred <- c(0.5*length(speed_brake_rpm)+1, length(speed_brake_rpm))
len(speed_brake_rpm)
data(speed_brake_rpm)
length(speed_brake_rpm)
lib <- c(1, 0.5*22792)
pred <- c(0.5*22792+1, 22792)
cols <- c("speed", "rpm", "brake")
target <- "brake"
block_smap_output <- block_lnlp(speed_brake_rpm, lib = lib, pred = pred, columns = cols, target_column = target, method = "s-map", theta = 3, stats_only = FALSE, first_column_time = TRUE, save_smap_coefficients = TRUE, silent = TRUE)
save(block_smap_output, file = "brake_block_smap_output.Rda")
smap_coeffs <- block_smap_output$smap_coefficients[[1]]
str(smap_coeffs)
predictions <- block_smap_output$model_output[[1]]
t <- predictions$time
plot(t, predictions$obs, type = "l", col = "black", ylab = "x", xlab = "")
lines(t, predictions$pred, lty = 2)
legend("topright", legend = c("observed", "predicted"), lty = c(1, 2), bty = "n")
plot(t, smap_coeffs[, 1], type = "l", col = "red", ylab = "effect of speed", xlab = "")
plot(t, smap_coeffs[, 2], type = "l", col = "blue", ylab = "effect of rpm", xlab = "")
plot(t, smap_coeffs[, 3], type = "l", col = "magenta", ylab = "effect of lags", xlab = "")
predictions <- block_smap_output$model_output[[1]]
t <- predictions$time
plot(t, predictions$obs, type = "l", col = "black", ylab = "x", xlab = "")
lines(t, predictions$pred, lty = 2)
legend("topright", legend = c("observed", "predicted"), lty = c(1, 2), bty = "n")
plot(t, smap_coeffs[, 1], type = "l", col = "red", ylab = "effect of speed", xlab = "")
plot(t, smap_coeffs[, 2], type = "l", col = "blue", ylab = "effect of rpm", xlab = "")
plot(t, smap_coeffs[, 3], type = "l", col = "magenta", ylab = "effect of lags", xlab = "")
cols = c("brake", "speed", "rpm")
block_smap_output <- block_lnlp(speed_brake_rpm, lib = lib, pred = pred, columns = cols, target_column = target, method = "s-map", theta = 3, stats_only = FALSE, first_column_time = TRUE, save_smap_coefficients = TRUE, silent = TRUE)
save(block_smap_output, file = "brake_block_smap_output.Rda")
smap_coeffs <- block_smap_output$smap_coefficients[[1]]
str(smap_coeffs)
predictions <- block_smap_output$model_output[[1]]
t <- predictions$time
plot(t, predictions$obs, type = "l", col = "black", ylab = "x", xlab = "")
lines(t, predictions$pred, lty = 2)
legend("topright", legend = c("observed", "predicted"), lty = c(1, 2), bty = "n")
plot(t, smap_coeffs[, 1], type = "l", col = "red", ylab = "effect of lags", xlab = "")
plot(t, smap_coeffs[, 2], type = "l", col = "blue", ylab = "effect of speed", xlab = "")
plot(t, smap_coeffs[, 3], type = "l", col = "magenta", ylab = "effect of rpm", xlab = "")
cols = c("speed", "rpm")
block_smap_output <- block_lnlp(speed_brake_rpm, lib = lib, pred = pred, columns = cols, target_column = target, method = "s-map", theta = 3, stats_only = FALSE, first_column_time = TRUE, save_smap_coefficients = TRUE, silent = TRUE)
smap_coeffs <- block_smap_output$smap_coefficients[[1]]
str(smap_coeffs)
predictions <- block_smap_output$model_output[[1]]
t <- predictions$time
plot(t, predictions$obs, type = "l", col = "black", ylab = "x", xlab = "")
lines(t, predictions$pred, lty = 2)
legend("topright", legend = c("observed", "predicted"), lty = c(1, 2), bty = "n")
plot(t, smap_coeffs[, 1], type = "l", col = "red", ylab = "effect of speed", xlab = "")
plot(t, smap_coeffs[, 2], type = "l", col = "blue", ylab = "effect of rpm", xlab = "")
list_of_model_predictions <- block_smap_output$model_output
first_data_frame_of_predictions <- list_of_model_predictions[[1]]
observed <- first_data_frame_of_predictions$obs
predicted <- first_data_frame_of_predictions$pred
plot_range <- range(c(observed, predicted), na.rm = TRUE)
plot(observed, predicted, xlim = plot_range, ylim = plot_range, xlab = "Observed", ylab = "Predicted", asp = 1)
abline(a = 0, b = 1, lty = 2, col = "blue")
View(block_smap_output)
View(block_smap_output)
View(first_data_frame_of_predictions)
View(first_data_frame_of_predictions)
View(block_smap_output)
str(block_smap_output.rmse)
str(block_smap_output$rmse)
data(block_3sp)
View(block_3sp)
block_smap_output <- block_lnlp(speed_brake_rpm, lib = lib, pred = pred, columns = cols, target_column = target, method = "s-map", tp = 300, theta = 3, stats_only = FALSE, first_column_time = TRUE, save_smap_coefficients = TRUE, silent = TRUE)
smap_coeffs <- block_smap_output$smap_coefficients[[1]]
str(smap_coeffs)
predictions <- block_smap_output$model_output[[1]]
t <- predictions$time
plot(t, predictions$obs, type = "l", col = "black", ylab = "x", xlab = "")
lines(t, predictions$pred, lty = 2)
legend("topright", legend = c("observed", "predicted"), lty = c(1, 2), bty = "n")
plot(t, smap_coeffs[, 1], type = "l", col = "red", ylab = "effect of speed", xlab = "")
plot(t, smap_coeffs[, 2], type = "l", col = "blue", ylab = "effect of rpm", xlab = "")
View(block_smap_output)
list_of_model_predictions <- block_smap_output$model_output
first_data_frame_of_predictions <- list_of_model_predictions[[1]]
observed <- first_data_frame_of_predictions$obs
predicted <- first_data_frame_of_predictions$pred
plot_range <- range(c(observed, predicted), na.rm = TRUE)
plot(observed, predicted, xlim = plot_range, ylim = plot_range, xlab = "Observed", ylab = "Predicted", asp = 1)
abline(a = 0, b = 1, lty = 2, col = "blue")
load(file = "ccm_optimal_tp.Rda")
View(output_df)
View(output_df)
ccm_speed_xmap_brake_neg400 = ccm(speed_brake_rpm, E = 9, lib_sizes = NROW(speed_brake_rpm), lib_column = "speed", target_column = "brake", tp = -400, random_libs = FALSE, num_neighbors = "E+1", silent = TRUE, stats_only=FALSE, first_column_time = TRUE)
ccm_speed_xmap_brake_neg400 = ccm(speed_brake_rpm, E = 9, lib_sizes = NROW(speed_brake_rpm), lib_column = "speed", target_column = "brake", tp = -400, random_libs = FALSE, num_neighbors = "E+1", silent = TRUE, first_column_time = TRUE)
View(ccm_speed_xmap_brake_neg400)
View(smap_output_brake)
output_df_means <- ccm_means(output_df)
warnings()
View(output_df_means)
output_brake <- output_df[11]
View(output_brake)
output_brake <- output_df[[11]]
View(output_df)
output_brake <- ccm_means(output_df[11])
output_brake <- ccm_means(output_df[[11]])
output_df[[11]]
output_df[11]
output_df[11,]
output_brake <- ccm_means(output_df[11,])
output_brake <- output_df[11,]
View(output_brake)
output_brake <- output_brake[,-1]
output_brake <- output_df[11,]
output_brake <- output_brake[,-1]
output_brake <- output_df[11,]
output_brake <- output_brake[,-12]
output_brake <- ccm_means(output_brake)
View(output_brake)
View(ccm_speed_xmap_brake_neg400)
output_brake <- ccm_means(ccm_speed_xmap_brake_neg400)
View(output_brake)
View(ccm_speed_xmap_brake_neg400)
brake_observed <- compute_stats
View(brake_observed)
compute_stats(ccm_speed_xmap_brake_neg400)
compute_stats(speed_brake_rpm$brake, ccm_speed_xmap_brake_neg400)
View(ccm_speed_xmap_brake_neg400)
brake_multiview <- multiview(speed_brake_rpm, lib = c(1, floor(NROW(speed_brake_rpm)/2)), pred = c(floor(NROW(speed_brake_rpm)/2) + 1, NROW(speed_brake_rpm)), norm_type = "L2 norm", P = 0.5, E = 9, tau = 1, tp = -300, max_lag = 3, num_neighbors = "e+1", k = "sqrt", na.rm = FALSE, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, exclusion_radius = NULL, silent = TRUE)
brake_multiview <- multiview(speed_brake_rpm, lib = c(1, floor(NROW(speed_brake_rpm)/2)), pred = c(floor(NROW(speed_brake_rpm)/2) + 1, NROW(speed_brake_rpm)), norm_type = "L2 norm", P = 0.5, E = 9, tau = 1, tp = -300, max_lag = 3, num_neighbors = "e+1", k = "sqrt", na.rm = FALSE, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, exclusion_radius = NULL, silent = TRUE, k = "all")
brake_multiview <- multiview(speed_brake_rpm, lib = c(1, floor(NROW(speed_brake_rpm)/2)), pred = c(floor(NROW(speed_brake_rpm)/2) + 1, NROW(speed_brake_rpm)), norm_type = "L2 norm", P = 0.5, E = 9, tau = 1, tp = -300, max_lag = 3, num_neighbors = "e+1", k = "sqrt", na.rm = FALSE, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, exclusion_radius = NULL, silent = TRUE)
brake_multiview <- multiview(speed_brake_rpm, lib = c(1, floor(NROW(speed_brake_rpm)/2)), pred = c(floor(NROW(speed_brake_rpm)/2) + 1, NROW(speed_brake_rpm)), norm_type = "L2 norm", E = 9, tau = 1, tp = -300, max_lag = 3, num_neighbors = "e+1", k = "sqrt", na.rm = FALSE, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, exclusion_radius = NULL, silent = TRUE)
brake_multiview <- multiview(speed_brake_rpm, lib = c(1, floor(NROW(speed_brake_rpm)/2)), pred = c(floor(NROW(speed_brake_rpm)/2) + 1, NROW(speed_brake_rpm)), norm_type = "L2 norm", E = 9, tau = 1, tp = 1, max_lag = 3, num_neighbors = "e+1", k = "sqrt", na.rm = FALSE, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, exclusion_radius = NULL, silent = TRUE)
View(speed_brake_rpm)
brake_multiview <- multiview(speed_brake_rpm, lib = c(1, floor(NROW(speed_brake_rpm)/2)), pred = c(floor(NROW(speed_brake_rpm)/2) + 1, NROW(speed_brake_rpm)), norm_type = "L2 norm", E = 9, tau = 1, tp = 1, max_lag = 3, num_neighbors = "e+1", k = "sqrt", na.rm = FALSE, target_column = "brake", stats_only = FALSE, first_column_time = TRUE, silent = TRUE)
brake_multiview <- multiview(speed_brake_rpm, target_column = "brake", stats_only = FALSE, first_column_time = TRUE, silent = TRUE)
View(brake_multiview)
predictions <- brake_multiview$model_output[[1]]
t <- predictions$time
plot(t, predictions$obs, type = "l", col = "black", ylab = "x", xlab = "")
lines(t, predictions$pred, lty = 2)
legend("topright", legend = c("observed", "predicted"), lty = c(1, 2), bty = "n")
save(brake_multiview, file = "brake_multiview.Rda")
View(brake_multiview)
brake_multiview <- multiview(speed_brake_rpm, lib = c(1, floor(NROW(speed_brake_rpm)/2)), pred = c(floor(NROW(speed_brake_rpm)/2) + 1, NROW(speed_brake_rpm)), norm_type = "L2 norm", E = 9, tau = 1, tp = -300, max_lag = 3, num_neighbors = "e+1", k = "sqrt", na.rm = FALSE, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, exclusion_radius = NULL, silent = TRUE)
brake_multiview <- multiview(speed_brake_rpm, E = 9, tau = 1, tp = -300, max_lag = 3, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, silent = TRUE)
brake_multiview <- multiview(speed_brake_rpm, tau = 1, tp = -300, max_lag = 3, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, silent = TRUE)
View(brake_multiview)
load(file = "brake_multiview.Rda")
multiview_output <- brake_multiview$model_output
View(multiview_output)
multiview_output <- multiview_output[1]
View(multiview_output)
multiview_output <- multiview_output[[1]]
View(multiview_output)
write.csv(multiview_output, file = "multiview_brake.csv")


22792
R/city/brake_data_city.csv Normal file

File diff suppressed because it is too large Load Diff

BIN
R/city/brake_multiview.Rda Normal file

Binary file not shown.

BIN
R/city/brake_xmap_rpm.Rda Normal file

Binary file not shown.

BIN
R/city/brake_xmap_speed.Rda Normal file

Binary file not shown.

BIN
R/city/ccm_one_row_city.pdf Normal file

Binary file not shown.

BIN
R/city/ccm_optimal_tp.Rda Normal file

Binary file not shown.

BIN
R/city/ccm_tp.pdf Normal file

Binary file not shown.

View File

@@ -0,0 +1,317 @@
par() features
mar
A numerical vector of the form c(bottom, left, top, right) which gives the number of lines of margin to be specified on the four sides of the plot. The default is c(5, 4, 4, 2) + 0.1
oma
A vector of the form c(bottom, left, top, right) giving the size of the outer margins in lines of text.
mgp
The margin line (in mex units) for the axis title, axis labels and axis line. Note that mgp[1] affects title whereas mgp[2:3] affect axis. The default is c(3, 1, 0).
library(rEDM)
setwd("C:/Users/brent/Google Drive/AFIT/PhD Research/Validation/R/city")
speed_brake_rpm <- read.csv("speed_brake_rpm_city.csv")
# CDN: setwd("C:/Users/bnolan/Google Drive/AFIT/PhD Research/Validation/R/city")
===============================================================================================================================================================================
1. Perform Simplex Projection to determine best embedding dimension
===============================================================================================================================================================================
ts_speed <- speed_brake_rpm$speed
lib_speed <- c(1, 0.5*length(ts_speed))
pred_speed <- c(0.5*length(ts_speed)+1, length(ts_speed))
ts_rpm <- speed_brake_rpm$rpm
lib_rpm <- c(1, 0.5*length(ts_rpm))
pred_rpm <- c(0.5*length(ts_rpm)+1, length(ts_rpm))
ts_brake <- speed_brake_rpm$brake
lib_brake <- c(1, 0.5*length(ts_brake))
pred_brake <- c(0.5*length(ts_brake)+1, length(ts_brake))
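# The first half of each series is the library used to build the model and the second half is held out
# for prediction, so rho measures out-of-sample forecast skill.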
simplex_output_speed <- simplex(ts_speed, lib_speed, pred_speed, silent = TRUE)
simplex_output_rpm <- simplex(ts_rpm, lib_rpm, pred_rpm, silent = TRUE)
simplex_output_brake <- simplex(ts_brake, lib_brake, pred_brake, silent = TRUE)
save(simplex_output_speed, file = "simplex_output_speed.Rda")
save(simplex_output_rpm, file = "simplex_output_rpm.Rda")
save(simplex_output_brake, file = "simplex_output_brake.Rda")
# If instead you're loading previous calculations....
load(file = "simplex_output_speed.Rda")
load(file = "simplex_output_rpm.Rda")
load(file = "simplex_output_brake.Rda")
===============================================================================================================================================================================
1a. Plot and Save the Embedding Results
===============================================================================================================================================================================
par(mar = c(4, 4, 1, 1), mgp = c(2.5, 1, 0)) # set margins for plotting
# create a 1-row, 3-column plot matrix (pg. 260 of R-cookbook)
par(mfrow=c(1,3), cex=1.5)
plot(simplex_output_speed$E, simplex_output_speed$rho, type = "l", xlab = "Embedding Dimension (E)", ylab = expression(paste("Forecast Skill (", rho, ")")), main="Vehicle Speed")
plot(simplex_output_rpm$E, simplex_output_rpm$rho, type = "l", xlab = "Embedding Dimension (E)", main="Engine RPM")
plot(simplex_output_brake$E, simplex_output_brake$rho, type = "l", xlab = "Embedding Dimension (E)", main="Brake Pressure")
# IF YOU WANT TO PLOT ALL THREE ON ONE LINE DIRECTLY TO A 5.5" x 2.5" PDF
# mgp affects (axis label offset, axis number offset, axis line offset). Resize the window in RStudio in order to resize the plot output when using the Export button.
pdf(file = "simplex_embedding_one_row_city.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,0,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(simplex_output_speed$E, simplex_output_speed$rho, type = "l", axes = FALSE, main="Vehicle Speed", ylim = c(0.99, 1.0), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.99, 0.995, 1.0), labels = c(0.99, 0.995, 1.0))
plot(simplex_output_rpm$E, simplex_output_rpm$rho, type = "l", axes = FALSE, main="Engine RPM", ylim = c(0.997, 1.0), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.997, 0.9985, 1.0), labels = c(0.997, 0.9985, 1.0))
plot(simplex_output_brake$E, simplex_output_brake$rho, type = "l", axes = FALSE, main="Brake Pressure", ylim = c(0.5, 0.9), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.5, 0.7, 0.9), labels = c(0.5, 0.7, 0.9))
title(xlab = "Embedding Dimension (E)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
===============================================================================================================================================================================
2. Re-run Simplex Projection with Optimal E to determine whether chaotic noise appears (Rho drops quickly with respect to increased tp)
===============================================================================================================================================================================
# Be sure to reload simplex_output_X using the E=9 computations before the next round of plots
simplex_output_speed <- simplex(ts_speed, lib_speed, pred_speed, E = 9, tp = 1:1000)
save(simplex_output_speed, file = "simplex_output_speed_E_9_tp_1-1000.Rda")
simplex_output_rpm <- simplex(ts_rpm, lib_rpm, pred_rpm, E = 9, tp = 1:1000)
save(simplex_output_rpm, file = "simplex_output_rpm_E_9_tp_1-1000.Rda")
simplex_output_brake <- simplex(ts_brake, lib_brake, pred_brake, E = 9, tp = 1:1000)
save(simplex_output_brake, file = "simplex_output_brake_E_9_tp_1-1000.Rda")
# If instead you're loading previous calculations....
load(file = "simplex_output_speed_E_9_tp_1-1000.Rda")
load(file = "simplex_output_rpm_E_9_tp_1-1000.Rda")
load(file = "simplex_output_brake_E_9_tp_1-1000.Rda")
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "simplex_forecasting_one_row_city.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,0,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(simplex_output_speed$tp, pmax(0, simplex_output_speed$rho), type = "l", axes = FALSE, main="Vehicle Speed", ylim = c(0, 1.0))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0, 0.5, 1.0), labels = c(0, 0.5, 1.0))
plot(simplex_output_rpm$tp, pmax(0, simplex_output_rpm$rho), type = "l", axes = FALSE, main="Engine RPM", ylim = c(0, 1.0))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0, 0.5, 1.0), labels = c(0, 0.5, 1.0))
plot(simplex_output_brake$tp, pmax(0, simplex_output_brake$rho), type = "l", axes = FALSE, main="Brake Pressure", ylim = c(0, 1.0))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0, 0.5, 1.0), labels = c(0, 0.5, 1.0))
title(xlab = "Time to Prediction (tp)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
===============================================================================================================================================================================
2. Distinguish between red noise and nonlinear deterministic behavior using S-maps
===============================================================================================================================================================================
smap_output_speed <- s_map(ts_speed, lib_speed, pred_speed, E = 9, silent = TRUE)
smap_output_rpm <- s_map(ts_rpm, lib_rpm, pred_rpm, E = 9, silent = TRUE)
smap_output_brake <- s_map(ts_brake, lib_brake, pred_brake, E = 9, silent = TRUE)
save(smap_output_speed, file = "smap_output_speed.Rda")
save(smap_output_rpm, file = "smap_output_rpm.Rda")
save(smap_output_brake, file = "smap_output_brake.Rda")
load(file = "smap_output_rpm.Rda")
load(file = "smap_output_speed.Rda")
load(file = "smap_output_brake.Rda")
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "smap_one_row_city.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,1,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(smap_output_speed$theta, smap_output_speed$rho, type = "l", axes = FALSE, main="Vehicle Speed", xlim=c(0,8), ylim = c(0.99997, 0.999975))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.99997,0.999975), labels = c(0.99997,0.999975))
plot(smap_output_rpm$theta, smap_output_rpm$rho, type = "l", axes = FALSE, main="Engine RPM", xlim=c(0,8), ylim = c(0.999931, 0.999937))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.999931,0.999937), labels = c(0.999931,0.999937))
plot(smap_output_brake$theta, smap_output_brake$rho, type = "l", axes = FALSE, main="Brake Pressure", xlim=c(0,8), ylim = c(0.986, 0.9885))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.986, 0.98725, 0.9885), labels = c(0.986, 0.98725, 0.9885))
title(xlab = "Nonlinear Tuning Parameter (theta)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# To do individual plots
plot(smap_output_speed$theta, smap_output_speed$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.9999912,0.9999931))
title(main = "S-Map Forecast Skill For Vehicle Speed")
plot(smap_output_brake$theta, smap_output_brake$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.99693,0.99696))
title(main = "S-Map Forecast Skill For Brake Pressure")
plot(smap_output_rpm$theta, smap_output_rpm$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.9999842,0.9999857))
title(main = "S-Map Forecast Skill For Engine Rotations Per Minute")
===============================================================================================================================================================================
3. Perform Convergent Cross Mapping Between Variables Using Optimal Embedding Dimension E
===============================================================================================================================================================================
# num_samples = 10, lib_sizes = seq(0, 15000, by = 3000) causes 10 models at each of the 5 library sizes
speed_xmap_rpm <- ccm(speed_brake_rpm, lib_column = "speed", target_column = "rpm", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(speed_xmap_rpm, file = "speed_xmap_rpm.Rda")
speed_xmap_brake <- ccm(speed_brake_rpm, lib_column = "speed", target_column = "brake", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(speed_xmap_brake, file = "speed_xmap_brake.Rda")
brake_xmap_rpm <- ccm(speed_brake_rpm, lib_column = "brake", target_column = "rpm", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(brake_xmap_rpm, file = "brake_xmap_rpm.Rda")
brake_xmap_speed <- ccm(speed_brake_rpm, lib_column = "brake", target_column = "speed", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(brake_xmap_speed, file = "brake_xmap_speed.Rda")
rpm_xmap_speed <- ccm(speed_brake_rpm, lib_column = "rpm", target_column = "speed", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(rpm_xmap_speed, file = "rpm_xmap_speed.Rda")
rpm_xmap_brake <- ccm(speed_brake_rpm, lib_column = "rpm", target_column = "brake", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(rpm_xmap_brake, file = "rpm_xmap_brake.Rda")
# X xmap Y: causal effect of (pred) Y onto (lib) X
load(file = "speed_xmap_rpm.Rda")
load(file = "speed_xmap_brake.Rda")
load(file = "brake_xmap_rpm.Rda")
load(file = "brake_xmap_speed.Rda")
load(file = "rpm_xmap_speed.Rda")
load(file = "rpm_xmap_brake.Rda")
speed_xmap_rpm_mean <- ccm_means(speed_xmap_rpm)
speed_xmap_brake_mean <- ccm_means(speed_xmap_brake)
brake_xmap_rpm_mean <- ccm_means(brake_xmap_rpm)
brake_xmap_speed_mean <- ccm_means(brake_xmap_speed)
rpm_xmap_speed_mean <- ccm_means(rpm_xmap_speed)
rpm_xmap_brake_mean <- ccm_means(rpm_xmap_brake)
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "ccm_one_row_city.pdf", width = 5.5, height = 2.0, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,1,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(speed_xmap_rpm_mean$lib_size, pmax(0, speed_xmap_rpm_mean$rho), type = "l", col = "red", axes = FALSE, main="Speed & RPM", xlim = c(3000, 15000), ylim = c(.5, .8))
lines(rpm_xmap_speed_mean$lib_size, pmax(0, rpm_xmap_speed_mean$rho), col = "blue")
legend(x = "right", legend = c("S xmap R", "R xmap S"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(3000, 9000, 15000), labels = c("3k", "9k", "15k"))
axis(side = 2, labels = TRUE)
plot(brake_xmap_rpm_mean$lib_size, pmax(0, brake_xmap_rpm_mean$rho), type = "l", col = "red", axes = FALSE, main="Brake & RPM", xlim = c(3000, 15000), ylim = c(.1, .3))
lines(rpm_xmap_brake_mean$lib_size, pmax(0, rpm_xmap_brake_mean$rho), col = "blue")
legend(x = "topright", legend = c("B xmap R", "R xmap B"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(3000, 9000, 15000), labels = c("3k", "9k", "15k"))
axis(side = 2, labels = TRUE)
plot(speed_xmap_brake_mean$lib_size, pmax(0, speed_xmap_brake_mean$rho), type = "l", col = "red", axes = FALSE, main="Speed & Brake", xlim = c(3000, 15000), ylim = c(.45, .6))
lines(brake_xmap_speed_mean$lib_size, pmax(0, brake_xmap_speed_mean$rho), col = "blue")
legend(x = "topright", legend = c("S xmap B", "B xmap S"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(3000, 9000, 15000), labels = c("3k", "9k", "15k"))
axis(side = 2, labels = TRUE)
title(xlab = "Library Size", ylab = "Cross Map Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
plot(speed_xmap_rpm_mean$lib_size, pmax(0, speed_xmap_rpm_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.86, 0.95))
lines(rpm_xmap_speed_mean$lib_size, pmax(0, rpm_xmap_speed_mean$rho), col = "blue")
legend(x = "topleft", legend = c("Speed xmap Engine RPM", "Engine RPM xmap Speed"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Speed and Engine RPM")
plot(brake_xmap_rpm_mean$lib_size, pmax(0, brake_xmap_rpm_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.35, 0.48))
lines(rpm_xmap_brake_mean$lib_size, pmax(0, rpm_xmap_brake_mean$rho), col = "blue")
legend(x = "left", legend = c("Brake Pressure xmap Engine RPM", "Engine RPM xmap Brake Pressure"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Brake Pressure and Engine RPM")
plot(speed_xmap_brake_mean$lib_size, pmax(0, speed_xmap_brake_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.6, 0.9))
lines(brake_xmap_speed_mean$lib_size, pmax(0, brake_xmap_speed_mean$rho), col = "blue")
legend(x = "left", legend = c("Speed xmap Brake Pressure", "Brake Pressure xmap Speed"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Speed and Brake Pressure")
===============================================================================================================================================================================
Extension 1. Look for optimal TP value
===============================================================================================================================================================================
vars_city <- names(speed_brake_rpm)[2:4]
tp_list = seq(-500, 500, 100)
params_city <- expand.grid(lib_column = vars_city, target_column = vars_city, tp = tp_list)
params_city <- params_city[params_city$lib_column != params_city$target_column, ]
params_city$E <- 9
output <- list()
step_size = 10
for (n in 0:floor(NROW(params_city)/step_size)) {
gc()
for (i in (n*step_size+1):(min((n+1)*step_size, NROW(params_city)))) {
output[[i]] = ccm(speed_brake_rpm, E = 9, lib_sizes = NROW(speed_brake_rpm), lib_column = params_city$lib_column[i], target_column = params_city$target_column[i], tp = params_city$tp[i], random_libs = FALSE, num_neighbors = "E+1", silent = TRUE)
}
}
library(data.table)
output_df = rbindlist(output)
output_df$direction <- paste(output_df$lib_column, "xmap to\n", output_df$target_column)
save(output_df, file = "ccm_optimal_tp.Rda")
library(ggplot2)
time_delay_ccm_fig <- ggplot(output_df, aes(x = tp, y = rho, color = direction)) + geom_line() + theme_bw() + scale_x_continuous(breaks=seq(-500, 500, 100))
print(time_delay_ccm_fig)
ggsave(filename = "ccm_tp.pdf", plot = time_delay_ccm_fig, device = "pdf", width = 5.5, height = 2.5, units = "in", dpi = "print")
===============================================================================================================================================================================
Extension 2. Model Brake using S-map of Brake, Speed, and RPM
===============================================================================================================================================================================
lib <- c(1, 0.5*22792)
pred <- c(0.5*22792+1, 22792)
cols <- c("speed", "rpm", "brake")
target <- "brake"
block_smap_output <- block_lnlp(speed_brake_rpm, lib = lib, pred = pred, columns = cols, target_column = target, method = "s-map", theta = 3, stats_only = FALSE, first_column_time = TRUE, save_smap_coefficients = TRUE, silent = TRUE)
save(block_smap_output, file = "brake_block_smap_output.Rda")
smap_coeffs <- block_smap_output$smap_coefficients[[1]]
str(smap_coeffs)
predictions <- block_smap_output$model_output[[1]]
t <- predictions$time
plot(t, predictions$obs, type = "l", col = "black", ylab = "x", xlab = "")
lines(t, predictions$pred, lty = 2)
legend("topright", legend = c("observed", "predicted"), lty = c(1, 2), bty = "n")
plot(t, smap_coeffs[, 1], type = "l", col = "red", ylab = "effect of speed", xlab = "")
plot(t, smap_coeffs[, 2], type = "l", col = "blue", ylab = "effect of rpm", xlab = "")
list_of_model_predictions <- block_smap_output$model_output
first_data_frame_of_predictions <- list_of_model_predictions[[1]]
observed <- first_data_frame_of_predictions$obs
predicted <- first_data_frame_of_predictions$pred
plot_range <- range(c(observed, predicted), na.rm = TRUE)
plot(observed, predicted, xlim = plot_range, ylim = plot_range, xlab = "Observed", ylab = "Predicted", asp = 1)
abline(a = 0, b = 1, lty = 2, col = "blue")
===============================================================================================================================================================================
Extension 3. Model Brake using CCM from Speed
===============================================================================================================================================================================
ccm_speed_xmap_brake_neg400 = ccm(speed_brake_rpm, E = 9, lib_sizes = NROW(speed_brake_rpm), lib_column = "speed", target_column = "brake", tp = -400, random_libs = FALSE, num_neighbors = "E+1", silent = TRUE, stats_only=FALSE, first_column_time = TRUE)
brake_multiview <- multiview(speed_brake_rpm, E = 9, tau = 1, tp = -300, max_lag = 3, target_column = "brake", stats_only = FALSE, save_lagged_block = TRUE, first_column_time = TRUE, silent = TRUE)
predictions <- brake_multiview$model_output[[1]]
t <- predictions$time
plot(t, predictions$obs, type = "l", col = "black", ylab = "x", xlab = "")
lines(t, predictions$pred, lty = 2)
legend("topright", legend = c("observed", "predicted"), lty = c(1, 2), bty = "n")

BIN
R/city/last.dump.rda Normal file

Binary file not shown.

11397
R/city/multiview_brake.csv Normal file

File diff suppressed because it is too large Load Diff

22792
R/city/rpm_data_city.csv Normal file

File diff suppressed because it is too large Load Diff

BIN
R/city/rpm_xmap_brake.Rda Normal file

Binary file not shown.

BIN
R/city/rpm_xmap_speed.Rda Normal file

Binary file not shown.

BIN
R/city/smap_output_rpm.Rda Normal file

Binary file not shown.

22792
R/city/speed_data_city.csv Normal file

File diff suppressed because it is too large Load Diff

BIN
R/city/speed_xmap_brake.Rda Normal file

Binary file not shown.

BIN
R/city/speed_xmap_rpm.Rda Normal file

Binary file not shown.

BIN
R/city/xmap_tp.pdf Normal file

Binary file not shown.

234
R/commands_list.txt Normal file
View File

@@ -0,0 +1,234 @@
par() features
mar
A numerical vector of the form c(bottom, left, top, right) which gives the number of lines of margin to be specified on the four sides of the plot. The default is c(5, 4, 4, 2) + 0.1
oma
A vector of the form c(bottom, left, top, right) giving the size of the outer margins in lines of text.
mgp
The margin line (in mex units) for the axis title, axis labels and axis line. Note that mgp[1] affects title whereas mgp[2:3] affect axis. The default is c(3, 1, 0).
library(rEDM)
setwd("C:/Users/brent/Google Drive/AFIT/PhD Research/Validation/R")
# CDN: setwd("C:/Users/bnolan/Google Drive/AFIT/PhD Research/Validation/R")
speed_brake_rpm <- read.csv("speed_brake_rpm.csv")
===============================================================================================================================================================================
1. Perform Simplex Projection to determine best embedding dimension
===============================================================================================================================================================================
ts_speed <- speed_brake_rpm$speed
lib_speed <- c(1, 0.5*length(ts_speed))
pred_speed <- c(0.5*length(ts_speed)+1, length(ts_speed))
ts_rpm <- speed_brake_rpm$rpm
lib_rpm <- c(1, 0.5*length(ts_rpm))
pred_rpm <- c(0.5*length(ts_rpm)+1, length(ts_rpm))
ts_brake <- speed_brake_rpm$brake
lib_brake <- c(1, 0.5*length(ts_brake))
pred_brake <- c(0.5*length(ts_brake)+1, length(ts_brake))
simplex_output_speed <- simplex(ts_speed, lib_speed, pred_speed, silent = TRUE)
simplex_output_rpm <- simplex(ts_rpm, lib_rpm, pred_rpm, silent = TRUE)
simplex_output_brake <- simplex(ts_brake, lib_brake, pred_brake, silent = TRUE)
save(simplex_output_speed, file = "simplex_output_speed.Rda")
save(simplex_output_rpm, file = "simplex_output_rpm.Rda")
save(simplex_output_brake, file = "simplex_output_brake.Rda")
# If instead you're loading previous calculations....
load(file = "simplex_output_speed.Rda")
load(file = "simplex_output_rpm.Rda")
load(file = "simplex_output_brake.Rda")
===============================================================================================================================================================================
1a. Plot and Save the Embedding Results
===============================================================================================================================================================================
par(mar = c(4, 4, 1, 1), mgp = c(2.5, 1, 0)) # set margins for plotting
# create a 1-row, 3-column plot matrix (pg. 260 of R-cookbook)
par(mfrow=c(1,3), cex=1.5)
plot(simplex_output_speed$E, simplex_output_speed$rho, type = "l", xlab = "Embedding Dimension (E)", ylab = expression(paste("Forecast Skill (", rho, ")")), main="Vehicle Speed")
plot(simplex_output_rpm$E, simplex_output_rpm$rho, type = "l", xlab = "Embedding Dimension (E)", main="Engine RPM")
plot(simplex_output_brake$E, simplex_output_brake$rho, type = "l", xlab = "Embedding Dimension (E)", main="Brake Pressure")
# IF YOU WANT TO PLOT ALL THREE ON ONE LINE DIRECTLY TO A 5.5" x 2.5" PDF
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(5,4,2,1) + 0.1, mar = c(0,0,1,2) + 0.1, mgp=c(2, 1, 0))
# mgp affects (axis label offset, axis number offset, axis line offset). Resize the window in RStudio in order to resize the plot output when using the Export button.
pdf(file = "simplex_embedding_one_row.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,0,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(simplex_output_speed$E, simplex_output_speed$rho, type = "l", axes = FALSE, main="Vehicle Speed", ylim = c(0.9992, 1.0), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, labels = TRUE)
plot(simplex_output_rpm$E, simplex_output_rpm$rho, type = "l", axes = FALSE, main="Engine RPM", ylim = c(0.9975, 1.0), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, labels = TRUE)
plot(simplex_output_brake$E, simplex_output_brake$rho, type = "l", axes = FALSE, main="Brake Pressure", ylim = c(0.97, 0.985), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, labels = TRUE)
title(xlab = "Embedding Dimension (E)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
===============================================================================================================================================================================
1b. Re-run Simplex Projection with the optimal E to determine whether forecast skill decays with the prediction horizon (rho dropping quickly as tp increases is a signature of deterministic chaos rather than noise)
===============================================================================================================================================================================
# Be sure to reload simplex_output_X using the E=9 computations before the next round of plots
simplex_output_speed <- simplex(ts_speed, lib_speed, pred_speed, E = 9, tp = 1:1000)
save(simplex_output_speed, file = "simplex_output_speed_E_9_tp_1-1000.Rda")
simplex_output_rpm <- simplex(ts_rpm, lib_rpm, pred_rpm, E = 9, tp = 1:1000)
save(simplex_output_rpm, file = "simplex_output_rpm_E_9_tp_1-1000.Rda")
simplex_output_brake <- simplex(ts_brake, lib_brake, pred_brake, E = 9, tp = 1:1000)
save(simplex_output_brake, file = "simplex_output_brake_E_9_tp_1-1000.Rda")
# If instead you're loading previous calculations....
load(file = "simplex_output_speed_E_9_tp_1-1000.Rda")
load(file = "simplex_output_rpm_E_9_tp_1-1000.Rda")
load(file = "simplex_output_brake_E_9_tp_1-1000.Rda")
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "simplex_forecasting_one_row.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,0,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(simplex_output_speed$tp, simplex_output_speed$rho, type = "l", axes = FALSE, main="Vehicle Speed")
axis(side = 1, labels = TRUE)
axis(side = 2, labels = TRUE)
plot(simplex_output_rpm$tp, simplex_output_rpm$rho, type = "l", axes = FALSE, main="Engine RPM")
axis(side = 1, labels = TRUE)
axis(side = 2, labels = TRUE)
plot(simplex_output_brake$tp, simplex_output_brake$rho, type = "l", axes = FALSE, main="Brake Pressure")
axis(side = 1, labels = TRUE)
axis(side = 2, labels = TRUE)
title(xlab = "Time to Prediction (tp)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
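To put a number on how quickly forecast skill decays with the prediction horizon, one option (a sketch, assuming the tp = 1:1000 simplex outputs above are loaded; the 0.5 threshold is arbitrary) is the first tp at which rho falls below a threshold:
decay_tp <- function(out, threshold = 0.5) {
  idx <- which(out$rho < threshold)
  if (length(idx) == 0) NA else min(out$tp[idx])
}
print(c(speed = decay_tp(simplex_output_speed), rpm = decay_tp(simplex_output_rpm), brake = decay_tp(simplex_output_brake)))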
===============================================================================================================================================================================
2. Distinguish between red noise and nonlinear deterministic behavior using S-maps
===============================================================================================================================================================================
smap_output_speed <- s_map(ts_speed, lib_speed, pred_speed, E = 9, silent = TRUE)
smap_output_rpm <- s_map(ts_rpm, lib_rpm, pred_rpm, E = 9, silent = TRUE)
smap_output_brake <- s_map(ts_brake, lib_brake, pred_brake, E = 9, silent = TRUE)
load(file = "smap_output_rpm.Rda")
load(file = "smap_output_speed.Rda")
load(file = "smap_output_brake.Rda")
smap_output_speed = smap_output[[1]]
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "smap_one_row.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,1,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(smap_output_speed$theta, smap_output_speed$rho, type = "l", axes = FALSE, main="Vehicle Speed", xlim=c(0,8), ylim = c(0.9999912,0.9999931))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.9999912,0.9999931), labels = c(0.999991,0.999993))
plot(smap_output_rpm$theta, smap_output_rpm$rho, type = "l", axes = FALSE, main="Engine RPM", xlim=c(0,8), ylim = c(0.9999842,0.9999857))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.9999842,0.9999857), labels = c(0.999984,0.999985))
plot(smap_output_brake$theta, smap_output_brake$rho, type = "l", axes = FALSE, main="Brake Pressure", xlim=c(0,8), ylim = c(0.99693,0.99696))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.99693,0.99696), labels = c(0.99693,0.99696))
title(xlab = "Nonlinear Tuning Parameter (theta)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# To do individual plots
plot(smap_output_speed$theta, smap_output_speed$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.9999912,0.9999931))
title(main = "S-Map Forecast Skill For Vehicle Speed")
plot(smap_output_brake$theta, smap_output_brake$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.99693,0.99696))
title(main = "S-Map Forecast Skill For Brake Pressure")
plot(smap_output_rpm$theta, smap_output_rpm$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.9999842,0.9999857))
title(main = "S-Map Forecast Skill For Engine Rotations Per Minute")
===============================================================================================================================================================================
3. Perform Convergent Cross Mapping Between Variables Using Optimal Embedding Dimension E
===============================================================================================================================================================================
# num_samples = 10, lib_sizes = seq(0, 15000, by = 3000) causes 10 random-library models at each library size (3000, 6000, ..., 15000; the size-0 entry is degenerate)
speed_xmap_rpm <- ccm(speed_brake_rpm, lib_column = "speed", target_column = "rpm", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(speed_xmap_rpm, file = "speed_xmap_rpm_v2.Rda")
speed_xmap_brake <- ccm(speed_brake_rpm, lib_column = "speed", target_column = "brake", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(speed_xmap_brake, file = "speed_xmap_brake_v2.Rda")
brake_xmap_rpm <- ccm(speed_brake_rpm, lib_column = "brake", target_column = "rpm", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(brake_xmap_rpm, file = "brake_xmap_rpm_v2.Rda")
brake_xmap_speed <- ccm(speed_brake_rpm, lib_column = "brake", target_column = "speed", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(brake_xmap_speed, file = "brake_xmap_speed_v2.Rda")
rpm_xmap_speed <- ccm(speed_brake_rpm, lib_column = "rpm", target_column = "speed", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(rpm_xmap_speed, file = "rpm_xmap_speed_v2.Rda")
rpm_xmap_brake <- ccm(speed_brake_rpm, lib_column = "rpm", target_column = "brake", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(rpm_xmap_brake, file = "rpm_xmap_brake_v2.Rda")
# X xmap Y: causal effect of (pred) Y onto (lib) X
load(file = "speed_xmap_rpm.Rda")
load(file = "speed_xmap_brake.Rda")
load(file = "brake_xmap_rpm.Rda")
load(file = "brake_xmap_speed.Rda")
load(file = "rpm_xmap_speed.Rda")
load(file = "rpm_xmap_brake.Rda")
speed_xmap_rpm_mean <- ccm_means(speed_xmap_rpm)
speed_xmap_brake_mean <- ccm_means(speed_xmap_brake)
brake_xmap_rpm_mean <- ccm_means(brake_xmap_rpm)
brake_xmap_speed_mean <- ccm_means(brake_xmap_speed)
rpm_xmap_speed_mean <- ccm_means(rpm_xmap_speed)
rpm_xmap_brake_mean <- ccm_means(rpm_xmap_brake)
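Before plotting, a quick convergence check on the cross-map results (a sketch using the *_mean data frames just computed): rho should increase from the smallest to the largest library size, and the stronger direction at the largest library suggests which causal influence dominates.
ccm_summary <- function(m) {
  m <- m[m$lib_size > 0 & is.finite(m$rho), ]  # ignore the degenerate size-0 library
  c(rho_smallest_lib = m$rho[which.min(m$lib_size)], rho_largest_lib = m$rho[which.max(m$lib_size)])
}
print(rbind(speed_xmap_rpm = ccm_summary(speed_xmap_rpm_mean), rpm_xmap_speed = ccm_summary(rpm_xmap_speed_mean), speed_xmap_brake = ccm_summary(speed_xmap_brake_mean), brake_xmap_speed = ccm_summary(brake_xmap_speed_mean)))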
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "ccm_one_row.pdf", width = 5.5, height = 2.0, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,1,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(speed_xmap_rpm_mean$lib_size, pmax(0, speed_xmap_rpm_mean$rho), type = "l", col = "red", axes = FALSE, main="Speed & RPM", xlim = c(5000, 15000), ylim = c(0.86, 0.95))
lines(rpm_xmap_speed_mean$lib_size, pmax(0, rpm_xmap_speed_mean$rho), col = "blue")
legend(x = "bottomright", legend = c("S xmap R", "R xmap S"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(5000, 10000, 15000), labels = c("5k", "10k", "15k"))
axis(side = 2, at = c(0.86, 0.89, 0.92, 0.95), labels = c(0.86, 0.89, 0.92, 0.95))
plot(brake_xmap_rpm_mean$lib_size, pmax(0, brake_xmap_rpm_mean$rho), type = "l", col = "red", axes = FALSE, main="Brake & RPM", xlim = c(5000, 15000), ylim = c(0.35, 0.49))
lines(rpm_xmap_brake_mean$lib_size, pmax(0, rpm_xmap_brake_mean$rho), col = "blue")
legend(x = "right", legend = c("B xmap R", "R xmap B"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(5000, 10000, 15000), labels = c("5k", "10k", "15k"))
axis(side = 2, at = c(0.35, 0.40, 0.45, 0.49), labels = c(0.35, 0.40, 0.45, 0.49))
plot(speed_xmap_brake_mean$lib_size, pmax(0, speed_xmap_brake_mean$rho), type = "l", col = "red", axes = FALSE, main="Speed & Brake", xlim = c(5000, 15000), ylim = c(0.6, 0.9))
lines(brake_xmap_speed_mean$lib_size, pmax(0, brake_xmap_speed_mean$rho), col = "blue")
legend(x = "right", legend = c("S xmap B", "B xmap S"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(5000, 10000, 15000), labels = c("5k", "10k", "15k"))
axis(side = 2, at = c(0.6, 0.7, 0.8, 0.9), labels = c(0.6, 0.7, 0.8, 0.9))
title(xlab = "Library Size", ylab = "Cross Map Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
plot(speed_xmap_rpm_mean$lib_size, pmax(0, speed_xmap_rpm_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.86, 0.95))
lines(rpm_xmap_speed_mean$lib_size, pmax(0, rpm_xmap_speed_mean$rho), col = "blue")
legend(x = "topleft", legend = c("Speed xmap Engine RPM", "Engine RPM xmap Speed"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Speed and Engine RPM")
plot(brake_xmap_rpm_mean$lib_size, pmax(0, brake_xmap_rpm_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.35, 0.48))
lines(rpm_xmap_brake_mean$lib_size, pmax(0, rpm_xmap_brake_mean$rho), col = "blue")
legend(x = "left", legend = c("Brake Pressure xmap Engine RPM", "Engine RPM xmap Brake Pressure"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Brake Pressure and Engine RPM")
plot(speed_xmap_brake_mean$lib_size, pmax(0, speed_xmap_brake_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.6, 0.9))
lines(brake_xmap_speed_mean$lib_size, pmax(0, brake_xmap_speed_mean$rho), col = "blue")
legend(x = "left", legend = c("Speed xmap Brake Pressure", "Brake Pressure xmap Speed"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Speed and Brake Pressure")

View File

@ -0,0 +1,7 @@
# Assumed setup (not shown in the original file): the variables to compare and an empty results matrix
vars <- c("speed", "brake", "rpm")
corr_matrix <- matrix(NA, nrow = length(vars), ncol = length(vars), dimnames = list(vars, vars))
for (ccm_from in vars) {
  for (ccm_to in vars[vars != ccm_from]) {
    # maximum absolute lagged cross-correlation between the two series, lags up to +/- 500
    cf_temp <- ccf(speed_brake_rpm[, ccm_from], speed_brake_rpm[, ccm_to], type = "correlation", lag.max = 500, plot = FALSE)$acf
    corr_matrix[ccm_from, ccm_to] <- max(abs(cf_temp))
  }
}
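A short usage sketch (not in the original file) for inspecting the resulting matrix of maximum lagged cross-correlations:
print(round(corr_matrix, 3))
# the variable pair with the strongest linear (lagged) relationship
which(corr_matrix == max(corr_matrix, na.rm = TRUE), arr.ind = TRUE)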

25
R/home/.Rhistory Normal file
View File

@ -0,0 +1,25 @@
library(rEDM)
setwd("C:/Users/bnolan/Google Drive/AFIT/PhD Research/Validation/R/home")
speed_brake_rpm_throttle <- read.csv("speed_brake_rpm_throttle_home.csv")
View(speed_brake_rpm_throttle)
View(speed_brake_rpm_throttle)
ts_speed <- speed_brake_rpm_throttle$speed
lib_speed <- c(1, 0.5*length(ts_speed))
pred_speed <- c(0.5*length(ts_speed)+1, length(ts_speed))
ts_rpm <- speed_brake_rpm_throttle$rpm
lib_rpm <- c(1, 0.5*length(ts_rpm))
pred_rpm <- c(0.5*length(ts_rpm)+1, length(ts_rpm))
ts_brake <- speed_brake_rpm_throttle$brake
lib_brake <- c(1, 0.5*length(ts_brake))
pred_brake <- c(0.5*length(ts_brake)+1, length(ts_brake))
ts_throttle <- speed_brake_rpm_throttle$throttle
lib_throttle <- c(1, 0.5*length(ts_throttle))
pred_throttle <- c(0.5*length(ts_throttle)+1, length(ts_throttle))
simplex_output_throttle <- simplex(ts_throttle, lib_throttle, pred_throttle, silent = TRUE)
simplex_output_speed <- simplex(ts_speed, lib_speed, pred_speed, silent = TRUE)
simplex_output_rpm <- simplex(ts_rpm, lib_rpm, pred_rpm, silent = TRUE)
simplex_output_brake <- simplex(ts_brake, lib_brake, pred_brake, silent = TRUE)
save(simplex_output_throttle, file = "simplex_output_throttle.Rda")
save(simplex_output_speed, file = "simplex_output_speed.Rda")
save(simplex_output_rpm, file = "simplex_output_rpm.Rda")
save(simplex_output_brake, file = "simplex_output_brake.Rda")

View File

@ -0,0 +1,266 @@
par() features
mar
A numerical vector of the form c(bottom, left, top, right) which gives the number of lines of margin to be specified on the four sides of the plot. The default is c(5, 4, 4, 2) + 0.1
oma
A vector of the form c(bottom, left, top, right) giving the size of the outer margins in lines of text.
mgp
The margin line (in mex units) for the axis title, axis labels and axis line. Note that mgp[1] affects title whereas mgp[2:3] affect axis. The default is c(3, 1, 0).
library(rEDM)
setwd("C:/Users/brent/Google Drive/AFIT/PhD Research/Validation/R/home")
# CDN: setwd("C:/Users/bnolan/Google Drive/AFIT/PhD Research/Validation/R/home")
speed_brake_rpm_throttle <- read.csv("speed_brake_rpm_throttle_home.csv")
===============================================================================================================================================================================
1. Perform Simplex Projection to determine best embedding dimension
===============================================================================================================================================================================
ts_speed <- speed_brake_rpm_throttle$speed
lib_speed <- c(1, 0.5*length(ts_speed))
pred_speed <- c(0.5*length(ts_speed)+1, length(ts_speed))
ts_rpm <- speed_brake_rpm_throttle$rpm
lib_rpm <- c(1, 0.5*length(ts_rpm))
pred_rpm <- c(0.5*length(ts_rpm)+1, length(ts_rpm))
ts_brake <- speed_brake_rpm_throttle$brake
lib_brake <- c(1, 0.5*length(ts_brake))
pred_brake <- c(0.5*length(ts_brake)+1, length(ts_brake))
ts_throttle <- speed_brake_rpm_throttle$throttle
lib_throttle <- c(1, 0.5*length(ts_throttle))
pred_throttle <- c(0.5*length(ts_throttle)+1, length(ts_throttle))
simplex_output_throttle <- simplex(ts_throttle, lib_throttle, pred_throttle, silent = TRUE)
simplex_output_speed <- simplex(ts_speed, lib_speed, pred_speed, silent = TRUE)
simplex_output_rpm <- simplex(ts_rpm, lib_rpm, pred_rpm, silent = TRUE)
simplex_output_brake <- simplex(ts_brake, lib_brake, pred_brake, silent = TRUE)
save(simplex_output_throttle, file = "simplex_output_throttle.Rda")
save(simplex_output_speed, file = "simplex_output_speed.Rda")
save(simplex_output_rpm, file = "simplex_output_rpm.Rda")
save(simplex_output_brake, file = "simplex_output_brake.Rda")
# If instead you're loading previous calculations....
load(file = "simplex_output_speed.Rda")
load(file = "simplex_output_rpm.Rda")
load(file = "simplex_output_brake.Rda")
load(file = "simplex_output_throttle.Rda")
# If you're running into std::bad_alloc memory issues, try using a smaller slice of the data (below: the first 10% as the library and the next 10% as the prediction set)....
ts_speed <- speed_brake_rpm_throttle$speed
lib_speed <- c(1, 0.1*length(ts_speed))
pred_speed <- c(0.1*length(ts_speed)+1, 0.2*length(ts_speed))
ts_rpm <- speed_brake_rpm_throttle$rpm
lib_rpm <- c(1, 0.1*length(ts_rpm))
pred_rpm <- c(0.1*length(ts_rpm)+1, 0.2*length(ts_rpm))
ts_brake <- speed_brake_rpm_throttle$brake
lib_brake <- c(1, 0.1*length(ts_brake))
pred_brake <- c(0.1*length(ts_brake)+1, 0.2*length(ts_brake))
ts_throttle <- speed_brake_rpm_throttle$throttle
lib_throttle <- c(1, 0.1*length(ts_throttle))
pred_throttle <- c(0.1*length(ts_throttle)+1, 0.2*length(ts_throttle))
===============================================================================================================================================================================
1a. Plot and Save the Embedding Results
===============================================================================================================================================================================
par(mar = c(4, 4, 1, 1), mgp = c(2.5, 1, 0)) # set margins for plotting
# create a 1-row, 3-column plot matrix (pg. 260 of R-cookbook)
par(mfrow=c(1,3), cex=1.5)
plot(simplex_output_speed$E, simplex_output_speed$rho, type = "l", xlab = "Embedding Dimension (E)", ylab = expression(paste("Forecast Skill (", rho, ")")), main="Vehicle Speed")
plot(simplex_output_rpm$E, simplex_output_rpm$rho, type = "l", xlab = "Embedding Dimension (E)", main="Engine RPM")
plot(simplex_output_brake$E, simplex_output_brake$rho, type = "l", xlab = "Embedding Dimension (E)", main="Brake Pressure")
# IF YOU WANT TO PLOT ALL FOUR ON ONE LINE DIRECTLY TO A 5.5" x 2.5" PDF
# mgp affects (axis label offset, axis number offset, axis line offset). Resize the window in RStudio in order to resize the plot output when using the Export button.
pdf(file = "simplex_embedding_one_row_home.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,4), cex.axis=1, cex.lab=1, oma = c(3,3,0,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(simplex_output_speed$E, simplex_output_speed$rho, type = "l", axes = FALSE, main="Vehicle Speed", ylim = c(0.9995, 1.0), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.9995, 1.0), labels = c(0.9995, 1.0))
plot(simplex_output_rpm$E, simplex_output_rpm$rho, type = "l", axes = FALSE, main="Engine RPM", ylim = c(0.9995, 1.0), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.9995, 1.0), labels = c(0.9995, 1.0))
plot(simplex_output_brake$E, simplex_output_brake$rho, type = "l", axes = FALSE, main="Brake Pressure", ylim = c(0.95, 1), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.95, 1), labels = c(0.95, 1))
plot(simplex_output_throttle$E, simplex_output_throttle$rho, type = "l", axes = FALSE, main="Throttle Pressure", ylim = c(0.992, 1), xlim = c(0, 10))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.992, 1), labels = c(0.992, 1))
title(xlab = "Embedding Dimension (E)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
===============================================================================================================================================================================
1b. Re-run Simplex Projection with the optimal E to determine whether forecast skill decays with the prediction horizon (rho dropping quickly as tp increases is a signature of deterministic chaos rather than noise)
===============================================================================================================================================================================
# Be sure to reload simplex_output_X using the E=9 computations before the next round of plots
simplex_output_speed <- simplex(ts_speed, lib_speed, pred_speed, E = 9, tp = 1:1000)
save(simplex_output_speed, file = "simplex_output_speed_E_9_tp_1-1000.Rda")
simplex_output_rpm <- simplex(ts_rpm, lib_rpm, pred_rpm, E = 9, tp = 1:1000)
save(simplex_output_rpm, file = "simplex_output_rpm_E_9_tp_1-1000.Rda")
simplex_output_brake <- simplex(ts_brake, lib_brake, pred_brake, E = 9, tp = 1:1000)
save(simplex_output_brake, file = "simplex_output_brake_E_9_tp_1-1000.Rda")
# If instead you're loading previous calculations....
load(file = "simplex_output_speed_E_9_tp_1-1000.Rda")
load(file = "simplex_output_rpm_E_9_tp_1-1000.Rda")
load(file = "simplex_output_brake_E_9_tp_1-1000.Rda")
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "simplex_forecasting_one_row_city.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,0,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(simplex_output_speed$tp, pmax(0, simplex_output_speed$rho), type = "l", axes = FALSE, main="Vehicle Speed", ylim = c(0, 1.0))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0, 0.5, 1.0), labels = c(0, 0.5, 1.0))
plot(simplex_output_rpm$tp, pmax(0, simplex_output_rpm$rho), type = "l", axes = FALSE, main="Engine RPM", ylim = c(0, 1.0))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0, 0.5, 1.0), labels = c(0, 0.5, 1.0))
plot(simplex_output_brake$tp, pmax(0, simplex_output_brake$rho), type = "l", axes = FALSE, main="Brake Pressure", ylim = c(0, 1.0))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0, 0.5, 1.0), labels = c(0, 0.5, 1.0))
title(xlab = "Time to Prediction (tp)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
===============================================================================================================================================================================
2. Distinguish between red noise and nonlinear deterministic behavior using S-maps
===============================================================================================================================================================================
smap_output_speed <- s_map(ts_speed, lib_speed, pred_speed, E = 9, silent = TRUE)
save(smap_output_speed, file = "smap_output_speed.Rda")
smap_output_rpm <- s_map(ts_rpm, lib_rpm, pred_rpm, E = 9, silent = TRUE)
save(smap_output_rpm, file = "smap_output_rpm.Rda")
smap_output_brake <- s_map(ts_brake, lib_brake, pred_brake, E = 9, silent = TRUE)
save(smap_output_brake, file = "smap_output_brake.Rda")
smap_output_throttle <- s_map(ts_throttle, lib_throttle, pred_throttle, E = 9, silent = TRUE)
save(smap_output_throttle, file = "smap_output_throttle.Rda")
load(file = "smap_output_rpm.Rda")
load(file = "smap_output_speed.Rda")
load(file = "smap_output_brake.Rda")
load(file = "smap_output_throttle.Rda")
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "smap_one_row_city.pdf", width = 5.5, height = 1.5, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,1,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(smap_output_speed$theta, smap_output_speed$rho, type = "l", axes = FALSE, main="Vehicle Speed", xlim=c(0,8), ylim = c(0.99997, 0.999975))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.99997,0.999975), labels = c(0.99997,0.999975))
plot(smap_output_rpm$theta, smap_output_rpm$rho, type = "l", axes = FALSE, main="Engine RPM", xlim=c(0,8), ylim = c(0.999931, 0.999937))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.999931,0.999937), labels = c(0.999931,0.999937))
plot(smap_output_brake$theta, smap_output_brake$rho, type = "l", axes = FALSE, main="Brake Pressure", xlim=c(0,8), ylim = c(0.986, 0.9885))
axis(side = 1, labels = TRUE)
axis(side = 2, at = c(0.986, 0.9885), labels = c(0.986, 0.9885))
title(xlab = "Nonlinear Tuning Parameter (theta)", ylab = "Forecast Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# To do individual plots
plot(smap_output_speed$theta, smap_output_speed$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.9999912,0.9999931))
title(main = "S-Map Forecast Skill For Vehicle Speed")
plot(smap_output_brake$theta, smap_output_brake$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.99693,0.99696))
title(main = "S-Map Forecast Skill For Brake Pressure")
plot(smap_output_rpm$theta, smap_output_rpm$rho, type = "l", xlab = "Nonlinearity (theta)", ylab = expression(paste("Forecast Skill (", rho, ")")), xlim=c(0,8), ylim=c(0.9999842,0.9999857))
title(main = "S-Map Forecast Skill For Engine Rotations Per Minute")
===============================================================================================================================================================================
3. Perform Convergent Cross Mapping Between Variables Using Optimal Embedding Dimension E
===============================================================================================================================================================================
# num_samples = 10 causes 10 random-library models at each non-zero library size in lib_sizes (the first call below uses lib_sizes up to 15000, the remaining calls up to 90000)
speed_xmap_rpm <- ccm(speed_brake_rpm_throttle, lib_column = "speed", target_column = "rpm", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(speed_xmap_rpm, file = "speed_xmap_rpm.Rda")
speed_xmap_brake <- ccm(speed_brake_rpm_throttle, lib_column = "speed", target_column = "brake", E = 9, lib_sizes = seq(0, 90000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(speed_xmap_brake, file = "speed_xmap_brake.Rda")
brake_xmap_rpm <- ccm(speed_brake_rpm_throttle, lib_column = "brake", target_column = "rpm", E = 9, lib_sizes = seq(0, 90000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(brake_xmap_rpm, file = "brake_xmap_rpm.Rda")
brake_xmap_speed <- ccm(speed_brake_rpm_throttle, lib_column = "brake", target_column = "speed", E = 9, lib_sizes = seq(0, 90000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(brake_xmap_speed, file = "brake_xmap_speed.Rda")
rpm_xmap_speed <- ccm(speed_brake_rpm_throttle, lib_column = "rpm", target_column = "speed", E = 9, lib_sizes = seq(0, 90000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(rpm_xmap_speed, file = "rpm_xmap_speed.Rda")
rpm_xmap_brake <- ccm(speed_brake_rpm_throttle, lib_column = "rpm", target_column = "brake", E = 9, lib_sizes = seq(0, 90000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(rpm_xmap_brake, file = "rpm_xmap_brake.Rda")
# X xmap Y: causal effect of (pred) Y onto (lib) X
load(file = "speed_xmap_rpm.Rda")
load(file = "speed_xmap_brake.Rda")
load(file = "brake_xmap_rpm.Rda")
load(file = "brake_xmap_speed.Rda")
load(file = "rpm_xmap_speed.Rda")
load(file = "rpm_xmap_brake.Rda")
speed_xmap_rpm_mean <- ccm_means(speed_xmap_rpm)
speed_xmap_brake_mean <- ccm_means(speed_xmap_brake)
brake_xmap_rpm_mean <- ccm_means(brake_xmap_rpm)
brake_xmap_speed_mean <- ccm_means(brake_xmap_speed)
rpm_xmap_speed_mean <- ccm_means(rpm_xmap_speed)
rpm_xmap_brake_mean <- ccm_means(rpm_xmap_brake)
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pdf(file = "ccm_one_row_city.pdf", width = 5.5, height = 2.0, family = "Helvetica")
op <- par(mfrow = c(1,3), cex.axis=1, cex.lab=1, oma = c(3,3,1,0) + 0.1, mar = c(0,1,1,1) + 0.1, mgp=c(2, 1, 0))
plot(speed_xmap_rpm_mean$lib_size, pmax(0, speed_xmap_rpm_mean$rho), type = "l", col = "red", axes = FALSE, main="Speed & RPM", xlim = c(3000, 15000), ylim = c(.5, .8))
lines(rpm_xmap_speed_mean$lib_size, pmax(0, rpm_xmap_speed_mean$rho), col = "blue")
legend(x = "right", legend = c("S xmap R", "R xmap S"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(3000, 9000, 15000), labels = c("3k", "9k", "15k"))
axis(side = 2, labels = TRUE)
plot(brake_xmap_rpm_mean$lib_size, pmax(0, brake_xmap_rpm_mean$rho), type = "l", col = "red", axes = FALSE, main="Brake & RPM", xlim = c(3000, 15000), ylim = c(.1, .3))
lines(rpm_xmap_brake_mean$lib_size, pmax(0, rpm_xmap_brake_mean$rho), col = "blue")
legend(x = "topright", legend = c("B xmap R", "R xmap B"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(3000, 9000, 15000), labels = c("3k", "9k", "15k"))
axis(side = 2, labels = TRUE)
plot(speed_xmap_brake_mean$lib_size, pmax(0, speed_xmap_brake_mean$rho), type = "l", col = "red", axes = FALSE, main="Speed & Brake", xlim = c(3000, 15000), ylim = c(.45, .6))
lines(brake_xmap_speed_mean$lib_size, pmax(0, brake_xmap_speed_mean$rho), col = "blue")
legend(x = "topright", legend = c("S xmap B", "B xmap S"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 1)
axis(side = 1, at = c(3000, 9000, 15000), labels = c("3k", "9k", "15k"))
axis(side = 2, labels = TRUE)
title(xlab = "Library Size", ylab = "Cross Map Skill (rho)", outer = TRUE)
par(op)
dev.off()
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
plot(speed_xmap_rpm_mean$lib_size, pmax(0, speed_xmap_rpm_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.86, 0.95))
lines(rpm_xmap_speed_mean$lib_size, pmax(0, rpm_xmap_speed_mean$rho), col = "blue")
legend(x = "topleft", legend = c("Speed xmap Engine RPM", "Engine RPM xmap Speed"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Speed and Engine RPM")
plot(brake_xmap_rpm_mean$lib_size, pmax(0, brake_xmap_rpm_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.35, 0.48))
lines(rpm_xmap_brake_mean$lib_size, pmax(0, rpm_xmap_brake_mean$rho), col = "blue")
legend(x = "left", legend = c("Brake Pressure xmap Engine RPM", "Engine RPM xmap Brake Pressure"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Brake Pressure and Engine RPM")
plot(speed_xmap_brake_mean$lib_size, pmax(0, speed_xmap_brake_mean$rho), type = "l", col = "red", xlab = "Library Size", ylab = expression(paste("Cross Map Skill (", rho, ")")), ylim = c(0.6, 0.9))
lines(brake_xmap_speed_mean$lib_size, pmax(0, brake_xmap_speed_mean$rho), col = "blue")
legend(x = "left", legend = c("Speed xmap Brake Pressure", "Brake Pressure xmap Speed"), col = c("red", "blue"), lwd = 1, bty = "n", inset = 0.02, cex = 1, y.intersp = 2)
title(main = "Cross Map Skill Between Speed and Brake Pressure")

12
R/load_xmap_Rda.R Normal file
View File

@ -0,0 +1,12 @@
load(file = "rpm_xmap_speed.Rda")
load(file = "rpm_xmap_brake.Rda")
load(file = "brake_xmap_speed.Rda")
load(file = "brake_xmap_rpm.Rda")
load(file = "speed_xmap_rpm.Rda")
load(file = "speed_xmap_brake.Rda")
ccm_out_1 <- ccm_means(speed_xmap_rpm)
ccm_out_2 <- ccm_means(speed_xmap_brake)
ccm_out_3 <- ccm_means(brake_xmap_rpm)
ccm_out_4 <- ccm_means(brake_xmap_speed)
ccm_out_5 <- ccm_means(rpm_xmap_speed)
ccm_out_6 <- ccm_means(rpm_xmap_brake)

1165
R/plant_data.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,31 @@
plot.new()
x <- ccm_out_2$lib_size
y_rpm_xmap_brake <- ccm_out_6$rho
y_speed_xmap_brake <- ccm_out_2$rho
plot(5,
5,
xlim=c(5000, 15000),
xlab = "Library Size",
ylim = c(.4,.9),
ylab = "Forcast Skill (rho)")
lines(x, y_rpm_xmap_brake, col="red", lwd=2)
lines(x, y_speed_xmap_brake, col="blue", lwd=2)
# Add a title
title("Causal Relationships Onto Brake Pressure")
# Add legend to plot
legend("topright",
inset=.2,
cex = 1,
#title="Legend",
c("RPM Xmap Brake","Speed Xmap Brake"),
horiz=FALSE,
lty=c(1,1),
lwd=c(2,2),
col=c("red","blue"),
bg="grey96")

31
R/plot_causal_onto_rpm.R Normal file
View File

@ -0,0 +1,31 @@
plot.new()
x <- ccm_out_3$lib_size
y_brake_xmap_rpm <- ccm_out_3$rho
y_speed_xmap_rpm <- ccm_out_1$rho
plot(5,
5,
xlim=c(5000, 15000),
xlab = "Library Size",
ylim = c(.3,1),
ylab = "Forcast Skill (rho)")
lines(x, y_brake_xmap_rpm, col="red", lwd=2)
lines(x, y_speed_xmap_rpm, col="blue", lwd=2)
# Add a title
title("Causal Relationships Onto Engine RPM")
# Add legend to plot
legend("topright",
inset=.2,
cex = 1,
#title="Legend",
c("Brake Xmap RPM","Speed Xmap RPM"),
horiz=FALSE,
lty=c(1,1),
lwd=c(2,2),
col=c("red","blue"),
bg="grey96")

View File

@ -0,0 +1,31 @@
plot.new()
x <- ccm_out_4$lib_size
y_brake_xmap_speed <- ccm_out_4$rho
y_rpm_xmap_speed <- ccm_out_5$rho
plot(5,
5,
xlim=c(5000, 15000),
xlab = "Library Size",
ylim = c(.6,.9),
ylab = "Forcast Skill (rho)")
lines(x, y_brake_xmap_speed, col="red", lwd=2)
lines(x, y_rpm_xmap_speed, col="blue", lwd=2)
# Add a title
title("Causal Relationships Onto Speed")
# Add legend to plot
legend("topright",
inset=.1,
cex = 1,
#title="Legend",
c("Brake Xmap Speed","RPM Xmap Speed"),
horiz=FALSE,
lty=c(1,1),
lwd=c(2,2),
col=c("red","blue"),
bg="grey96")

17842
R/rpm_data.csv Normal file

File diff suppressed because it is too large Load Diff

BIN
R/rpm_xmap_brake Normal file

Binary file not shown.

BIN
R/rpm_xmap_brake.Rda Normal file

Binary file not shown.

BIN
R/rpm_xmap_brake_v2.Rda Normal file

Binary file not shown.

BIN
R/rpm_xmap_speed.Rda Normal file

Binary file not shown.

BIN
R/rpm_xmap_speed_v2.Rda Normal file

Binary file not shown.

14
R/run_all_xmap.R Normal file
View File

@ -0,0 +1,14 @@
speed_xmap_rpm <- ccm(speed_brake_rpm, lib_column = "speed", target_column = "rpm", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(speed_xmap_rpm, file = "speed_xmap_rpm_v2.Rda")
speed_xmap_brake <- ccm(speed_brake_rpm, lib_column = "speed", target_column = "brake", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(speed_xmap_brake, file = "speed_xmap_brake_v2.Rda")
brake_xmap_rpm <- ccm(speed_brake_rpm, lib_column = "brake", target_column = "rpm", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(brake_xmap_rpm, file = "brake_xmap_rpm_v2.Rda")
brake_xmap_speed <- ccm(speed_brake_rpm, lib_column = "brake", target_column = "speed", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(brake_xmap_speed, file = "brake_xmap_speed_v2.Rda")
rpm_xmap_speed <- ccm(speed_brake_rpm, lib_column = "rpm", target_column = "speed", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(rpm_xmap_speed, file = "rpm_xmap_speed_v2.Rda")
rpm_xmap_brake <- ccm(speed_brake_rpm, lib_column = "rpm", target_column = "brake", E = 9, lib_sizes = seq(0, 15000, by = 3000), num_samples = 10, random_libs = TRUE, replace = TRUE, num_neighbors = "E+1", tp = 0, silent = TRUE)
save(rpm_xmap_brake, file = "rpm_xmap_brake_v2.Rda")

BIN
R/simplex_output_brake.Rda Normal file

Binary file not shown.

BIN
R/simplex_output_brake.pdf Normal file

Binary file not shown.

BIN
R/simplex_output_rpm.Rda Normal file

Binary file not shown.

BIN
R/simplex_output_rpm.pdf Normal file

Binary file not shown.

Some files were not shown because too many files have changed in this diff