First Commit. Details to follow; I'm still learning.

This commit is contained in:
JoshuaArking 2019-08-27 13:35:21 -04:00 committed by JoshuaArking
parent ca184a56de
commit 3749d4f303
29 changed files with 608 additions and 197 deletions

2
.idea/.gitignore vendored Normal file

@ -0,0 +1,2 @@
# Default ignored files
/workspace.xml

16
.idea/CAN_Reverse_Engineering_git.iml Normal file

@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/Pipeline_multi-file" isTestSource="false" />
<excludeFolder url="file://$MODULE_DIR$/Example_Pipeline_Output" />
<excludeFolder url="file://$MODULE_DIR$/Pipeline" />
<excludeFolder url="file://$MODULE_DIR$/R" />
</content>
<orderEntry type="jdk" jdkName="Python 3.7 (CAN_Reverse_Engineering)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>

6
.idea/inspectionProfiles/profiles_settings.xml Normal file

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
.idea/misc.xml Normal file

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (CAN_Reverse_Engineering)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/CAN_Reverse_Engineering_git.iml" filepath="$PROJECT_DIR$/.idea/CAN_Reverse_Engineering_git.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml Normal file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

0
Pipeline/ArbID.py Normal file → Executable file

0
Pipeline/J1979.py Normal file → Executable file

0
Pipeline/LexicalAnalysis.py Normal file → Executable file

356
Pipeline/Main.py Normal file → Executable file

@ -9,190 +9,202 @@ from SemanticAnalysis import subset_selection, subset_correlation, greedy_signal
from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster
from PipelineTimer import PipelineTimer
i = 0
j = 0
# File names for the on-disc data input and output.
# Input:
can_data_filename: str = 'drive_runway_afit.log'
# can_data_filename: str = 'loggerProgram0.log'
#can_data_filename: str = 'drive_runway_afit.log'
can_data_filename: str = 'loggerProgram0.log'
# Output:
output_folder: str = 'output'
pickle_arb_id_filename: str = 'pickleArbIDs.p'
pickle_j1979_filename: str = 'pickleJ1979.p'
pickle_signal_filename: str = 'pickleSignals.p'
pickle_subset_filename: str = 'pickleSubset.p'
csv_correlation_filename: str = 'subset_correlation_matrix.csv'
pickle_j1979_correlation: str = 'pickleJ1979_correlation.p'
pickle_clusters_filename: str = 'pickleClusters.p'
pickle_all_signal_filename: str = 'pickleAllSignalsDataFrame.p'
csv_all_signals_filename: str = 'complete_correlation_matrix.csv'
pickle_timer_filename: str = 'pickleTimer.p'
while i < 51:
if i == 50 and j < 50:  # TODO: optimize and redesign this sweep
j += 1
i = 0
elif i == 50 and j == 50:
i = 51
else:
i += 1
# Output:
output_folder: str = 'output'
pickle_arb_id_filename: str = 'pickleArbIDs.p'
pickle_j1979_filename: str = 'pickleJ1979.p'
pickle_signal_filename: str = 'pickleSignals.p'
pickle_subset_filename: str = 'pickleSubset.p'
csv_correlation_filename: str = 'subset_correlation_matrix.csv'
pickle_j1979_correlation: str = 'pickleJ1979_correlation.p'
pickle_clusters_filename: str = 'pickleClusters.p'
pickle_all_signal_filename: str = 'pickleAllSignalsDataFrame.p'
csv_all_signals_filename: str = 'complete_correlation_matrix.csv'
pickle_timer_filename: str = 'pickleTimer.p'
# Change out the normalization strategies as needed.
tang_normalize_strategy: Callable = minmax_scale
signal_normalize_strategy: Callable = minmax_scale
# Change out the normalization strategies as needed.
tang_normalize_strategy: Callable = minmax_scale
signal_normalize_strategy: Callable = minmax_scale
# Turn on or off portions of the pipeline and output methods using these flags.
force_pre_processing: bool = False
force_j1979_plotting: bool = False
# Turn on or off portions of the pipeline and output methods using these flags.
force_pre_processing: bool = False
force_j1979_plotting: bool = True
force_lexical_analysis: bool = False
force_arb_id_plotting: bool = True
force_lexical_analysis: bool = True
force_arb_id_plotting: bool = True
force_semantic_analysis: bool = False
force_signal_labeling: bool = False
use_j1979_tags_in_plots: bool = True
force_cluster_plotting: bool = False
force_semantic_analysis: bool = True
force_signal_labeling: bool = True
use_j1979_tags_in_plots: bool = True
force_cluster_plotting: bool = True
dump_to_pickle: bool = True
dump_to_pickle: bool = True
# Parameters and threshold used for Arb ID transmission frequency analysis during Pre-processing.
time_conversion = 1000 # convert seconds to milliseconds
z_lookup = {.8: 1.28, .9: 1.645, .95: 1.96, .98: 2.33, .99: 2.58}
freq_analysis_accuracy = z_lookup[0.9]
freq_synchronous_threshold = 0.1
# Parameters and threshold used for Arb ID transmission frequency analysis during Pre-processing.
time_conversion = 1000 # convert seconds to milliseconds
z_lookup = {.8: 1.28, .9: 1.645, .95: 1.96, .98: 2.33, .99: 2.58}
freq_analysis_accuracy = z_lookup[0.9]
freq_synchronous_threshold = 0.1
# Threshold parameters used during lexical analysis.
tokenization_bit_distance: float = 0.2
tokenize_padding: bool = True
# Threshold parameters used during semantic analysis
subset_selection_size: float = 0.25
fuzzy_labeling: bool = True
min_correlation_threshold: float = 0.85
# A timer class to record timings throughout the pipeline.
a_timer = PipelineTimer(verbose=True)
# DATA IMPORT AND PRE-PROCESSING #
pre_processor = PreProcessor(can_data_filename, pickle_arb_id_filename, pickle_j1979_filename)
id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
tang_normalize_strategy,
time_conversion,
freq_analysis_accuracy,
freq_synchronous_threshold,
force_pre_processing)
if j1979_dictionary:
plot_j1979(a_timer, j1979_dictionary, force_j1979_plotting)
# Threshold parameters used during lexical analysis. Default is 0.2
tokenization_bit_distance: float = i/100
tokenize_padding: bool = True
# LEXICAL ANALYSIS #
print("\n\t\t\t##### BEGINNING LEXICAL ANALYSIS #####")
tokenize_dictionary(a_timer,
id_dictionary,
force_lexical_analysis,
include_padding=tokenize_padding,
merge=True,
max_distance=tokenization_bit_distance)
signal_dictionary = generate_signals(a_timer,
id_dictionary,
pickle_signal_filename,
signal_normalize_strategy,
force_lexical_analysis)
plot_signals_by_arb_id(a_timer, id_dictionary, signal_dictionary, force_arb_id_plotting)
# SEMANTIC ANALYSIS #
print("\n\t\t\t##### BEGINNING SEMANTIC ANALYSIS #####")
subset_df = subset_selection(a_timer,
signal_dictionary,
pickle_subset_filename,
force_semantic_analysis,
subset_size=subset_selection_size)
corr_matrix_subset = subset_correlation(subset_df, csv_correlation_filename, force_semantic_analysis)
cluster_dict = greedy_signal_clustering(corr_matrix_subset,
correlation_threshold=min_correlation_threshold,
fuzzy_labeling=fuzzy_labeling)
df_full, corr_matrix_full, cluster_dict = label_propagation(a_timer,
pickle_clusters_filename=pickle_clusters_filename,
pickle_all_signals_df_filename=pickle_all_signal_filename,
csv_signals_correlation_filename=csv_all_signals_filename,
signal_dict=signal_dictionary,
cluster_dict=cluster_dict,
correlation_threshold=min_correlation_threshold,
force=force_semantic_analysis)
signal_dictionary, j1979_correlations = j1979_signal_labeling(a_timer=a_timer,
j1979_corr_filename=pickle_j1979_correlation,
df_signals=df_full,
j1979_dict=j1979_dictionary,
signal_dict=signal_dictionary,
correlation_threshold=min_correlation_threshold,
force=force_signal_labeling)
plot_signals_by_cluster(a_timer, cluster_dict, signal_dictionary, use_j1979_tags_in_plots, force_cluster_plotting)
# Threshold parameters used during semantic analysis Default is 0.25 and 0.85
subset_selection_size: float = j/100
fuzzy_labeling: bool = True
min_correlation_threshold: float = 0.85
# DATA STORAGE #
if dump_to_pickle:
if force_pre_processing:
if path.isfile(pickle_arb_id_filename):
remove(pickle_arb_id_filename)
if path.isfile(pickle_j1979_filename):
remove(pickle_j1979_filename)
if force_lexical_analysis or force_signal_labeling:
if path.isfile(pickle_signal_filename):
remove(pickle_signal_filename)
if force_semantic_analysis:
if path.isfile(pickle_subset_filename):
remove(pickle_subset_filename)
if path.isfile(csv_correlation_filename):
remove(csv_correlation_filename)
if path.isfile(pickle_j1979_correlation):
remove(pickle_j1979_correlation)
if path.isfile(pickle_clusters_filename):
remove(pickle_clusters_filename)
if path.isfile(pickle_all_signal_filename):
remove(pickle_all_signal_filename)
if path.isfile(csv_all_signals_filename):
remove(csv_all_signals_filename)
# A timer class to record timings throughout the pipeline.
a_timer = PipelineTimer(verbose=True)
timer_flag = 0
if not path.exists(output_folder):
mkdir(output_folder)
chdir(output_folder)
if not path.isfile(pickle_arb_id_filename):
timer_flag += 1
print("\nDumping arb ID dictionary to " + pickle_arb_id_filename)
dump(id_dictionary, open(pickle_arb_id_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_j1979_filename):
timer_flag += 1
print("\nDumping J1979 dictionary to " + pickle_j1979_filename)
dump(j1979_dictionary, open(pickle_j1979_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_signal_filename):
timer_flag += 1
print("\nDumping signal dictionary to " + pickle_signal_filename)
dump(signal_dictionary, open(pickle_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_subset_filename):
timer_flag += 1
print("\nDumping signal subset list to " + pickle_subset_filename)
dump(subset_df, open(pickle_subset_filename, "wb"))
print("\tComplete...")
if not path.isfile(csv_correlation_filename):
timer_flag += 1
print("\nDumping subset correlation matrix to " + csv_correlation_filename)
corr_matrix_subset.to_csv(csv_correlation_filename)
print("\tComplete...")
if not path.isfile(pickle_j1979_correlation):
timer_flag += 1
print("\nDumping J1979 correlation DataFrame to " + pickle_j1979_correlation)
dump(j1979_correlations, open(pickle_j1979_correlation, "wb"))
print("\tComplete...")
if not path.isfile(pickle_clusters_filename):
timer_flag += 1
print("\nDumping cluster dictionary to " + pickle_clusters_filename)
dump(cluster_dict, open(pickle_clusters_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_all_signal_filename):
timer_flag += 1
print("\nDumping complete signals DataFrame to " + pickle_all_signal_filename)
dump(df_full, open(pickle_all_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(csv_all_signals_filename):
timer_flag += 1
print("\nDumping complete correlation matrix to " + csv_all_signals_filename)
corr_matrix_full.to_csv(csv_all_signals_filename)
print("\tComplete...")
if timer_flag == 9:
print("\nDumping pipeline timer to " + pickle_timer_filename)
dump(a_timer, open(pickle_timer_filename, "wb"))
print("\tComplete...")
chdir("..")
# DATA IMPORT AND PRE-PROCESSING #
pre_processor = PreProcessor(can_data_filename, pickle_arb_id_filename, pickle_j1979_filename)
id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
tang_normalize_strategy,
time_conversion,
freq_analysis_accuracy,
freq_synchronous_threshold,
force_pre_processing)
if j1979_dictionary:
plot_j1979(a_timer, j1979_dictionary, force_j1979_plotting)
# LEXICAL ANALYSIS #
print("\n\t\t\t##### BEGINNING LEXICAL ANALYSIS #####")
tokenize_dictionary(a_timer,
id_dictionary,
force_lexical_analysis,
include_padding=tokenize_padding,
merge=True,
max_distance=tokenization_bit_distance)
signal_dictionary = generate_signals(a_timer,
id_dictionary,
pickle_signal_filename,
signal_normalize_strategy,
force_lexical_analysis)
plot_signals_by_arb_id(a_timer, id_dictionary, signal_dictionary, i, force_arb_id_plotting)
# SEMANTIC ANALYSIS #
print("\n\t\t\t##### BEGINNING SEMANTIC ANALYSIS #####")
subset_df = subset_selection(a_timer,
signal_dictionary,
pickle_subset_filename,
force_semantic_analysis,
subset_size=subset_selection_size)
corr_matrix_subset = subset_correlation(subset_df, csv_correlation_filename, force_semantic_analysis)
cluster_dict = greedy_signal_clustering(corr_matrix_subset,
correlation_threshold=min_correlation_threshold,
fuzzy_labeling=fuzzy_labeling)
df_full, corr_matrix_full, cluster_dict = label_propagation(a_timer,
pickle_clusters_filename=pickle_clusters_filename,
pickle_all_signals_df_filename=pickle_all_signal_filename,
csv_signals_correlation_filename=csv_all_signals_filename,
signal_dict=signal_dictionary,
cluster_dict=cluster_dict,
correlation_threshold=min_correlation_threshold,
force=force_semantic_analysis)
signal_dictionary, j1979_correlations = j1979_signal_labeling(a_timer=a_timer,
j1979_corr_filename=pickle_j1979_correlation,
df_signals=df_full,
j1979_dict=j1979_dictionary,
signal_dict=signal_dictionary,
correlation_threshold=min_correlation_threshold,
force=force_signal_labeling)
plot_signals_by_cluster(a_timer, cluster_dict, signal_dictionary, use_j1979_tags_in_plots, i, force_cluster_plotting)
# DATA STORAGE #
if dump_to_pickle:
if force_pre_processing:
if path.isfile(pickle_arb_id_filename):
remove(pickle_arb_id_filename)
if path.isfile(pickle_j1979_filename):
remove(pickle_j1979_filename)
if force_lexical_analysis or force_signal_labeling:
if path.isfile(pickle_signal_filename):
remove(pickle_signal_filename)
if force_semantic_analysis:
if path.isfile(pickle_subset_filename):
remove(pickle_subset_filename)
if path.isfile(csv_correlation_filename):
remove(csv_correlation_filename)
if path.isfile(pickle_j1979_correlation):
remove(pickle_j1979_correlation)
if path.isfile(pickle_clusters_filename):
remove(pickle_clusters_filename)
if path.isfile(pickle_all_signal_filename):
remove(pickle_all_signal_filename)
if path.isfile(csv_all_signals_filename):
remove(csv_all_signals_filename)
timer_flag = 0
if not path.exists(output_folder):
mkdir(output_folder)
chdir(output_folder)
if not path.isfile(pickle_arb_id_filename):
timer_flag += 1
print("\nDumping arb ID dictionary to " + pickle_arb_id_filename)
dump(id_dictionary, open(pickle_arb_id_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_j1979_filename):
timer_flag += 1
print("\nDumping J1979 dictionary to " + pickle_j1979_filename)
dump(j1979_dictionary, open(pickle_j1979_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_signal_filename):
timer_flag += 1
print("\nDumping signal dictionary to " + pickle_signal_filename)
dump(signal_dictionary, open(pickle_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_subset_filename):
timer_flag += 1
print("\nDumping signal subset list to " + pickle_subset_filename)
dump(subset_df, open(pickle_subset_filename, "wb"))
print("\tComplete...")
if not path.isfile(csv_correlation_filename):
timer_flag += 1
print("\nDumping subset correlation matrix to " + csv_correlation_filename)
corr_matrix_subset.to_csv(csv_correlation_filename)
print("\tComplete...")
if not path.isfile(pickle_j1979_correlation):
timer_flag += 1
print("\nDumping J1979 correlation DataFrame to " + pickle_j1979_correlation)
dump(j1979_correlations, open(pickle_j1979_correlation, "wb"))
print("\tComplete...")
if not path.isfile(pickle_clusters_filename):
timer_flag += 1
print("\nDumping cluster dictionary to " + pickle_clusters_filename)
dump(cluster_dict, open(pickle_clusters_filename, "wb"))
print("\tComplete...")
if not path.isfile(pickle_all_signal_filename):
timer_flag += 1
print("\nDumping complete signals DataFrame to " + pickle_all_signal_filename)
dump(df_full, open(pickle_all_signal_filename, "wb"))
print("\tComplete...")
if not path.isfile(csv_all_signals_filename):
timer_flag += 1
print("\nDumping complete correlation matrix to " + csv_all_signals_filename)
corr_matrix_full.to_csv(csv_all_signals_filename)
print("\tComplete...")
if timer_flag == 9:
print("\nDumping pipeline timer to " + pickle_timer_filename)
dump(a_timer, open(pickle_timer_filename, "wb"))
print("\tComplete...")
chdir("..")
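
The rewritten Main.py sweeps tokenization_bit_distance (i/100) and subset_selection_size (j/100) over a 51x51 grid with hand-maintained i/j counters. A minimal sketch of one possible cleanup, not part of this commit: wrap the pipeline stages in a function and drive both thresholds with itertools.product.

from itertools import product

def run_pipeline(tokenization_bit_distance: float, subset_selection_size: float) -> None:
    ...  # pre-processing, lexical analysis, semantic analysis, and data storage as above

for i, j in product(range(51), repeat=2):
    run_pipeline(tokenization_bit_distance=i / 100, subset_selection_size=j / 100)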

0
Pipeline/PipelineTimer.py Normal file → Executable file

9
Pipeline/Plotter.py Normal file → Executable file

@ -16,7 +16,10 @@ cluster_folder: str = 'clusters'
j1979_folder: str = 'j1979'
def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, force: bool=False):
def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, settings: int, force: bool = False):
arb_id_folder = 'figures' + str(settings)
if path.exists(arb_id_folder):
if force:
rmtree(arb_id_folder)
@ -29,7 +32,7 @@ def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dic
for k_id, signals in signal_dict.items():
arb_id = arb_id_dict[k_id]
if not arb_id.static:
print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ")")
print(str(settings) + "Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ")")
a_timer.start_iteration_time()
signals_to_plot = []
@ -99,7 +102,9 @@ def plot_signals_by_cluster(a_timer: PipelineTimer,
cluster_dict: dict,
signal_dict: dict,
use_j1979_tags: bool,
settings: int,
force: bool=False):
cluster_folder = 'cluster' + str(settings)
if path.exists(cluster_folder):
if force:
rmtree(cluster_folder)
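
Both plotting functions now take a settings index so each sweep iteration writes into its own directory ('figures<i>' and 'cluster<i>') instead of overwriting earlier output. A minimal sketch of that naming scheme; the helper name output_dir_for is hypothetical:

from os import path, mkdir

def output_dir_for(base: str, settings: int) -> str:
    folder = base + str(settings)  # e.g. 'figures3' or 'cluster7'
    if not path.exists(folder):
        mkdir(folder)
    return folder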

2
Pipeline/PreProcessor.py Normal file → Executable file

@ -44,7 +44,7 @@ class PreProcessor:
header=None,
names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
skiprows=7,
delimiter='\t',
delimiter=' ',
converters=convert_dict,
index_col=0)
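
The only change here swaps the read_csv delimiter from a tab to a single space, presumably to match the loggerProgram capture format. A minimal sketch of the import call as it reads after the change (converters omitted; the file name is the one used in Main.py):

from pandas import read_csv

frame = read_csv('loggerProgram0.log',
                 header=None,
                 names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
                 skiprows=7,
                 delimiter=' ',
                 index_col=0)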

0
Pipeline/SemanticAnalysis.py Normal file → Executable file

0
Pipeline/Signal.py Normal file → Executable file

0
Pipeline_multi-file/ArbID.py Normal file → Executable file

5
Pipeline_multi-file/FileBoi.py Normal file → Executable file

@ -60,13 +60,14 @@ class FileBoi:
# Check if this file name matches the expected name for a CAN data sample. If so, create new Sample
m = re.match('loggerProgram[\d]+.log', file)
if m:
i = 0
if not (make, model, year) in sample_dict:
sample_dict[(make, model, year)] = []
this_sample_index = str(len(sample_dict[(make, model, year)]))
this_sample = Sample(make=make, model=model, year=year, sample_index=this_sample_index,
sample_path=dirName + "/" + m.group(0), kfold_n=kfold_n)
sample_path=dirName + "/" + m.group(0), kfold_n=kfold_n)
sample_dict[(make, model, year)].append(this_sample)
current_vehicle = []
current_vehicle = []
else:
if this_dir == "Captures":
continue
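
FileBoi matches capture files named loggerProgram<N>.log and groups the resulting samples by (make, model, year). A minimal sketch of that matching and grouping, with placeholder vehicle metadata and file names:

import re

sample_dict = {}
for file in ['loggerProgram0.log', 'notes.txt', 'loggerProgram12.log']:
    if re.match(r'loggerProgram\d+\.log', file):
        # key by vehicle; values are the per-vehicle capture files
        sample_dict.setdefault(('make', 'model', 2019), []).append(file)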

0
Pipeline_multi-file/J1979.py Normal file → Executable file

130
Pipeline_multi-file/KnownSignalAnalysis.py Normal file

@ -0,0 +1,130 @@
from numpy import float64, nditer, uint64, zeros, ndarray, inf
from pandas import Series, DataFrame
from os import path, remove
from pickle import load
from ArbID import ArbID
from Signal import Signal
from PipelineTimer import PipelineTimer
from typing import List
from scipy import integrate
def transform_signal(a_timer: PipelineTimer,
arb_id_dict: dict,
signal_dict: dict,
transform_pickle_filename: str,
normalize_strategy,
given_arb_id: int,
force=False):
if force and path.isfile(transform_pickle_filename):
remove(transform_pickle_filename)
if path.isfile(transform_pickle_filename):
print("\nSignal transformation already completed and forcing is turned off. Using pickled data...")
return load(open(transform_pickle_filename, "rb"))
a_timer.start_function_time()
transform_dict = signal_dict
# arb_id_dict[given_arb_id * 256] = ArbID(given_arb_id * 256)
for k, arb_id in arb_id_dict.items():
# print(str(arb_id.id) + " == " + str(given_arb_id) + " ?\n")
if arb_id.id == given_arb_id:
arb_id.static = False
arb_id.short = False
if not arb_id.static:
for token in arb_id.tokenization:
a_timer.start_iteration_time()
signal = Signal(k * 256, token[0], token[1])
signal.static = False
# Convert the binary ndarray to a list of string representations of each row
temp1 = [''.join(str(x) for x in row) for row in arb_id.boolean_matrix[:, token[0]:token[1] + 1]]
temp2 = zeros((temp1.__len__()+1), dtype=uint64)
# convert each string representation to int
for i, row in enumerate(temp1):
temp2[i] = int(row, 2)
temp3 = integrate.cumtrapz(temp2)
print("Arb Id " + str(k) + ", Signal from " + str(token[0]) + " to " + str(token[1]) + " Integrated successfully")
# create a float64 pandas.Series using the time index from this Arb ID's original data.
signal.time_series = Series(temp3[:], index=arb_id.original_data.index, dtype=float64)
# Normalize the signal and update its meta-data
signal.normalize_and_set_metadata(normalize_strategy)
# add this signal to the signal dictionary which is keyed by Arbitration ID
if (k * 256) in transform_dict:
transform_dict[k * 256][(arb_id.id * 256, signal.start_index, signal.stop_index)] = signal
else:
print("Successfully added to transform dict")
transform_dict[k * 256] = {(arb_id.id * 256, signal.start_index, signal.stop_index): signal}
a_timer.set_token_to_signal()
a_timer.set_signal_generation()
return transform_dict
def transform_signals(a_timer: PipelineTimer,
arb_id_dict: dict,
transform_pickle_filename: str,
normalize_strategy,
force=False):
if force and path.isfile(transform_pickle_filename):
remove(transform_pickle_filename)
if path.isfile(transform_pickle_filename):
print("\nSignal transformation already completed and forcing is turned off. Using pickled data...")
return load(open(transform_pickle_filename, "rb"))
a_timer.start_function_time()
transform_dict = {} # arb_id_dict
for k, arb_id in arb_id_dict.items():
if not arb_id.static:
for token in arb_id.tokenization:
a_timer.start_iteration_time()
signal = Signal(k * 256, token[0], token[1])
# Convert the binary ndarray to a list of string representations of each row
temp1 = [''.join(str(x) for x in row) for row in arb_id.boolean_matrix[:, token[0]:token[1] + 1]]
temp2 = zeros((temp1.__len__()+1), dtype=uint64)
# convert each string representation to int
for i, row in enumerate(temp1):
temp2[i] = int(row, 2)
temp3 = integrate.cumtrapz(temp2)
# create a float64 pandas.Series using the time index from this Arb ID's original data.
signal.time_series = Series(temp3[:], index=arb_id.original_data.index, dtype=float64)
# Normalize the signal and update its meta-data
signal.normalize_and_set_metadata(normalize_strategy)
# add this signal to the signal dictionary which is keyed by Arbitration ID
if k in transform_dict:
transform_dict[k][(arb_id.id, signal.start_index, signal.stop_index)] = signal
else:
transform_dict[k] = {(arb_id.id, signal.start_index, signal.stop_index): signal}
a_timer.set_token_to_signal()
a_timer.set_signal_generation()
return transform_dict
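
transform_signal packs each tokenized bit slice of the known Arb ID into integers and integrates them with scipy's cumulative trapezoid rule, storing the derived signal under a shifted key (arb_id * 256) so it can be correlated like any other signal. A minimal sketch of the integration step on assumed token values:

import numpy as np
from scipy import integrate

token_values = np.array([0, 2, 4, 4, 3, 1], dtype=np.float64)  # assumed per-frame token values
derived = integrate.cumtrapz(token_values)  # cumulative integral; length is len(token_values) - 1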

0
Pipeline_multi-file/LexicalAnalysis.py Normal file → Executable file

30
Pipeline_multi-file/Main.py Normal file → Executable file

@ -5,14 +5,22 @@ from Sample import Sample
# Cross validation parameters for finding an optimal tokenization inversion distance threshold -- NOT WORKING?
kfold_n: int = 5
current_vehicle_number = 0
known_speed_arb_id = 514
good_boi = FileBoi()
samples = good_boi.go_fetch(kfold_n)
for key, sample_list in samples.items(): # type: tuple, list
for sample in sample_list: # type: Sample
print(current_vehicle_number)
# sample.tang_inversion_bit_dist += (0.01 * current_vehicle_number)
# sample.max_inter_cluster_dist += (0.01 * current_vehicle_number)
# sample.tang_inversion_bit_dist = round(sample.tang_inversion_bit_dist, 2) # removes floating point errors
# sample.max_inter_cluster_dist = round(sample.max_inter_cluster_dist, 2)
# print("\n\t##### Settings are " + str(sample.tang_inversion_bit_dist) + " and " + str(
# sample.max_inter_cluster_dist) + " #####")
print("\nData import and Pre-Processing for " + sample.output_vehicle_dir)
id_dict, j1979_dict = sample.pre_process()
id_dict, j1979_dict = sample.pre_process(known_speed_arb_id)
if j1979_dict:
sample.plot_j1979(j1979_dict, vehicle_number=str(current_vehicle_number))
@ -25,14 +33,22 @@ for key, sample_list in samples.items(): # type: tuple, list
print("\n\t##### BEGINNING LEXICAL ANALYSIS OF " + sample.output_vehicle_dir + " #####")
sample.tokenize_dictionary(id_dict)
signal_dict = sample.generate_signals(id_dict, bool(j1979_dict))
sample.plot_arb_ids(id_dict, signal_dict, vehicle_number=str(current_vehicle_number))
# sample.plot_arb_ids(id_dict, signal_dict, vehicle_number=str(current_vehicle_number))
# LEXICAL ANALYSIS #
# KNOWN SIGNAL ANALYSIS #
print("\n\t##### BEGINNING KNOWN SIGNAL ANALYSIS OF " + sample.output_vehicle_dir + " #####")
transform_dict= sample.transform_signal(id_dict, signal_dict, known_speed_arb_id)
sample.plot_arb_ids(id_dict, transform_dict, vehicle_number=str(current_vehicle_number))
# SEMANTIC ANALYSIS #
print("\n\t##### BEGINNING SEMANTIC ANALYSIS OF " + sample.output_vehicle_dir + " #####")
corr_matrix, combined_df = sample.generate_correlation_matrix(signal_dict)
corr_matrix, combined_df = sample.generate_correlation_matrix(transform_dict)
if j1979_dict:
signal_dict, j1979_correlation = sample.j1979_labeling(j1979_dict, signal_dict, combined_df)
transform_dict, j1979_correlation = sample.j1979_labeling(j1979_dict, transform_dict, combined_df)
cluster_dict, linkage_matrix = sample.cluster_signals(corr_matrix)
sample.plot_clusters(cluster_dict, signal_dict, bool(j1979_dict), vehicle_number=str(current_vehicle_number))
# sample.plot_clusters(cluster_dict, signal_dict, bool(j1979_dict), vehicle_number=str(current_vehicle_number))
sample.plot_known_signal_cluster(cluster_dict, signal_dict, bool(j1979_dict), known_speed_arb_id, vehicle_number=str(current_vehicle_number))
sample.plot_dendrogram(linkage_matrix, vehicle_number=str(current_vehicle_number))
current_vehicle_number += 1

0
Pipeline_multi-file/PipelineTimer.py Normal file → Executable file

165
Pipeline_multi-file/Plotter.py Normal file → Executable file

@ -25,13 +25,13 @@ def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dic
rmtree(arb_id_folder)
else:
print("\nArbID plotting appears to have already been done and forcing is turned off. Skipping...")
return
# return
a_timer.start_function_time()
for k_id, signals in signal_dict.items():
arb_id = arb_id_dict[k_id]
if not arb_id.static and not arb_id.short:
if (not arb_id.static and not arb_id.short) or k_id == 155136:
print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ") for Vehicle " + vehicle_number)
a_timer.start_iteration_time()
@ -85,7 +85,7 @@ def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dic
chdir(arb_id_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig(hex(arb_id.id) + "." + figure_format,
savefig(hex(signal.arb_id) + "." + figure_format,
bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
@ -311,3 +311,162 @@ def plot_dendrogram(a_timer: PipelineTimer,
transparent=figure_transp)
plt.close()
print("\t\tComplete...")
def plot_known_signal_cluster(a_timer: PipelineTimer,
cluster_dict: dict,
signal_dict: dict,
use_j1979_tags: bool,
vehicle_number: str,
given_arb_id: int,
force: bool = False):
if path.exists(cluster_folder):
if force:
rmtree(cluster_folder)
else:
print("\nCluster plotting appears to have already been done and forcing is turned off. Skipping...")
return
a_timer.start_function_time()
print("\n")
for cluster_number, list_of_signals in cluster_dict.items():
if [v for i, v in enumerate(list_of_signals) if (v[0] == given_arb_id or v[0] == given_arb_id * 256)]:
print("Plotting cluster", cluster_number, "with " + str(len(list_of_signals)) + " signals.")
a_timer.start_iteration_time()
# Setup the plot
fig, axes = plt.subplots(nrows=len(list_of_signals), ncols=1, squeeze=False)
plt.suptitle("Signal Cluster " + str(cluster_number) + " from Vehicle " + vehicle_number,
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + len(list_of_signals)+1) * 1.3)
size_adjust = len(list_of_signals) / 100
# The min() statement provides whitespace for the suptitle depending on the number of subplots.
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
# Plot the time series of each signal in the cluster
for i, signal_key in enumerate(list_of_signals):
signal = signal_dict[signal_key[0]][signal_key]
ax = axes[i, 0]
if signal.j1979_title and use_j1979_tags:
this_title = signal.plot_title + " [" + signal.j1979_title + \
" (PCC:" + str(round(signal.j1979_pcc, 2)) + ")]"
else:
this_title = signal.plot_title
ax.set_title(this_title,
style='italic',
size='medium')
ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
ax.plot(signal.time_series, color='black')
if not path.exists(cluster_folder):
mkdir(cluster_folder)
chdir(cluster_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
if len(list_of_signals) < 100:  # avoids errors when a low correlation threshold puts too many signals in one cluster
savefig("cluster_" + str(cluster_number) + "." + figure_format,
bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
else:
print("Too many clusters to plot! Skipping...")
chdir("..")
plt.close(fig)
a_timer.set_plot_save_cluster()
print("\tComplete...")
a_timer.set_plot_save_cluster_dict()
def plot_signals_by_arb_id(a_timer: PipelineTimer, arb_id_dict: dict, signal_dict: dict, vehicle_number: str,
force: bool=False):
if path.exists(arb_id_folder):
if force:
rmtree(arb_id_folder)
else:
print("\nArbID plotting appears to have already been done and forcing is turned off. Skipping...")
# return
a_timer.start_function_time()
for k_id, signals in signal_dict.items():
arb_id = arb_id_dict[k_id]
if (not arb_id.static and not arb_id.short) or k_id == 155136:
print("Plotting Arb ID " + str(k_id) + " (" + str(hex(k_id)) + ") for Vehicle " + vehicle_number)
a_timer.start_iteration_time()
signals_to_plot = []
# Don't plot the static signals
for k_signal, signal in signals.items():
if not signal.static:
signals_to_plot.append(signal)
# There's a corner case where the Arb ID only has static signals. This conditional accounts for this.
# TODO: This corner case should probably be reflected by arb_id.static.
if len(signals_to_plot) < 1:
continue
# One row per signal plus one for the TANG. Squeeze is used to force axes to be an array to avoid errors.
fig, axes = plt.subplots(nrows=1 + len(signals_to_plot), ncols=1)
plt.suptitle("Time Series and TANG for Arbitration ID " + hex(k_id) + " from Vehicle " + vehicle_number,
weight='bold',
position=(0.5, 1))
fig.set_size_inches(8, (1 + len(signals_to_plot) + 1) * 1.3)
# The min() statement provides whitespace for the title depending on the number of subplots.
size_adjust = len(signals_to_plot) / 100
plt.tight_layout(h_pad=1, rect=(0, 0, 1, min(0.985, 0.93 + size_adjust)))
# This adjusts whitespace padding on the left and right of the subplots
fig.subplots_adjust(left=0.07, right=0.98)
for i, signal in enumerate(signals_to_plot):
ax = axes[i]
ax.set_title(signal.plot_title,
style='italic',
size='medium')
ax.set_xlim([signal.time_series.first_valid_index(), signal.time_series.last_valid_index()])
ax.plot(signal.time_series, color='black')
# Add a 25% opacity dashed black line to the entropy gradient plot at one boundary of each sub-flow
axes[-1].axvline(x=signal.start_index, alpha=0.25, c='black', linestyle='dashed')
# Plot the entropy gradient at the bottom of the overall output
ax = axes[-1]
ax.set_title("Min-Max Normalized Transition Aggregation N-Gram (TANG)",
style='italic',
size='medium')
tang_bit_width = arb_id.tang.shape[0]
ax.set_xlim([-0.01 * tang_bit_width, 1.005 * tang_bit_width])
y = arb_id.tang[:]
# Differentiate bit positions with non-zero and zero entropy using black circles and grey triangles respectively.
ix = isin(y, 0)
pad_bit = where(ix)
non_pad_bit = where(~ix)
ax.scatter(non_pad_bit, y[non_pad_bit], color='black', marker='o', s=10)
ax.scatter(pad_bit, y[pad_bit], color='grey', marker='^', s=10)
if not path.exists(arb_id_folder):
mkdir(arb_id_folder)
chdir(arb_id_folder)
# If you want transparent backgrounds, a different file format, etc. then change these settings accordingly.
savefig(hex(signal.arb_id) + "." + figure_format,
bbox_inches='tight',
pad_inches=0.0,
dpi=figure_dpi,
format=figure_format,
transparent=figure_transp)
chdir("..")
plt.close(fig)
a_timer.set_plot_save_arb_id()
print("\tComplete...")
a_timer.set_plot_save_arb_id_dict()
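
The TANG subplot separates zero-entropy (padding) bit positions from the rest so they can be drawn with different markers. A minimal sketch of that split with numpy, on an assumed normalized TANG vector:

import numpy as np

tang = np.array([0.0, 0.7, 0.0, 0.3, 1.0])   # assumed min-max normalized TANG values
pad_bit = np.where(np.isin(tang, 0))         # zero-entropy positions -> grey triangles
non_pad_bit = np.where(~np.isin(tang, 0))    # remaining positions -> black circles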

10
Pipeline_multi-file/PreProcessor.py Normal file → Executable file

@ -1,4 +1,4 @@
from pandas import DataFrame, read_csv, Series
from pandas import DataFrame, read_csv, Series, concat
from numpy import int64
from os import path, remove, getcwd
from pickle import load
@ -45,7 +45,7 @@ class PreProcessor:
header=None,
names=['time', 'id', 'dlc', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
skiprows=7,
delimiter='\t',
delimiter=' ',
converters=convert_dict,
index_col=0)
@ -70,6 +70,7 @@ class PreProcessor:
time_conversion: int = 1000,
freq_analysis_accuracy: float = 0.0,
freq_synchronous_threshold: float = 0.0,
given_arb_id: int = 0,
force: bool = False) -> (dict, dict):
id_dictionary = {}
j1979_dictionary = {}
@ -92,6 +93,11 @@ class PreProcessor:
return id_dictionary, j1979_dictionary
else:
self.import_csv(a_timer, self.data_filename)
this_id = self.data.loc[self.data['id'] == given_arb_id].copy()
this_id.id = given_arb_id * 256
combined = concat([self.data, this_id])
self.data = combined
a_timer.start_function_time()
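
generate_arb_id_dictionary now takes the known speed Arb ID, copies its rows under a shifted ID (given_arb_id * 256), and concatenates the copy back onto the imported frame so the derived signal flows through the rest of the pipeline as its own entry. A minimal sketch of that duplication, assuming data is the imported DataFrame:

from pandas import concat

def duplicate_known_id(data, given_arb_id: int):
    this_id = data.loc[data['id'] == given_arb_id].copy()
    this_id['id'] = given_arb_id * 256  # the shifted ID keeps the copy distinct from the original
    return concat([data, this_id])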

52
Pipeline_multi-file/Sample.py Normal file → Executable file

@ -2,7 +2,7 @@ from PreProcessor import PreProcessor
from Validator import Validator
from LexicalAnalysis import tokenize_dictionary, generate_signals
from SemanticAnalysis import generate_correlation_matrix, signal_clustering, j1979_signal_labeling
from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster, plot_dendrogram
from Plotter import plot_j1979, plot_signals_by_arb_id, plot_signals_by_cluster, plot_dendrogram, plot_known_signal_cluster
from sklearn.preprocessing import minmax_scale
from typing import Callable
from PipelineTimer import PipelineTimer
@ -11,6 +11,8 @@ from pickle import dump, load
from numpy import ndarray, zeros, float16
from pandas import DataFrame
from KnownSignalAnalysis import transform_signals, transform_signal
# File names for the on-disc data input and output.
output_folder: str = 'output'
pickle_arb_id_filename: str = 'pickleArbIDs.p'
@ -26,6 +28,8 @@ pickle_combined_df_filename: str = 'pickleCombinedDataFrame.p'
csv_all_signals_filename: str = 'complete_correlation_matrix.csv'
pickle_timer_filename: str = 'pickleTimer.p'
pickle_transform_filename: str = 'pickleTransform'
dump_to_pickle: bool = True
# Change out the normalization strategies as needed.
@ -39,9 +43,11 @@ force_threshold_plotting: bool = False
force_j1979_plotting: bool = True
use_j1979: bool = True
force_transform: bool = False
force_lexical_analysis: bool = False
force_signal_generation: bool = False
force_arb_id_plotting: bool = True
force_arb_id_plotting: bool = False
force_correlation_matrix: bool = False
force_clustering: bool = False
@ -58,16 +64,15 @@ freq_synchronous_threshold = 0.1
# Threshold parameters used during lexical analysis.
tokenization_bit_distance: float = 0.2
tokenize_padding: bool = True
tokenize_padding: bool = False  # setting this to False seems to help find weak signals
merge_tokens: bool = True
# Threshold parameters used during semantic analysis
subset_selection_size: float = 0.25
max_intra_cluster_distance: float = 0.20
max_intra_cluster_distance: float = 0.10 # normally 0.25
min_j1979_correlation: float = 0.85
# fuzzy_labeling: bool = True
# A timer class to record timings throughout the pipeline.
a_timer = PipelineTimer(verbose=True)
@ -112,7 +117,7 @@ class Sample:
# Move back to root of './output/make_model_year/sample_index/"
chdir("../../../")
def pre_process(self):
def pre_process(self, given_arb_id):
self.make_and_move_to_vehicle_directory()
pre_processor = PreProcessor(self.path, pickle_arb_id_filename, pickle_j1979_filename, self.use_j1979)
id_dictionary, j1979_dictionary = pre_processor.generate_arb_id_dictionary(a_timer,
@ -120,6 +125,7 @@ class Sample:
time_conversion,
freq_analysis_accuracy,
freq_synchronous_threshold,
given_arb_id,
force_pre_processing)
if dump_to_pickle:
if force_pre_processing:
@ -303,3 +309,37 @@ class Sample:
plot_dendrogram(a_timer=a_timer, linkage_matrix=linkage_matrix, threshold=self.max_inter_cluster_dist,
vehicle_number=vehicle_number, force=force_dendrogram_plotting)
self.move_back_to_parent_directory()
def transform_signals(self, id_dictionary: dict):
self.make_and_move_to_vehicle_directory()
transform_dict = transform_signals(a_timer=a_timer,
arb_id_dict=id_dictionary,
transform_pickle_filename=pickle_transform_filename,
normalize_strategy=signal_normalize_strategy,
force=force_transform)
self.move_back_to_parent_directory()
return transform_dict
def transform_signal(self, id_dictionary: dict, signal_dict: dict, arb_id: int):
self.make_and_move_to_vehicle_directory()
transform_dict = transform_signal(a_timer=a_timer,
arb_id_dict=id_dictionary,
signal_dict=signal_dict,
transform_pickle_filename=pickle_transform_filename,
normalize_strategy=signal_normalize_strategy,
given_arb_id=arb_id,
force=force_transform)
self.move_back_to_parent_directory()
return transform_dict
def plot_known_signal_cluster(self, cluster_dictionary: dict, signal_dictionary: dict, use_j1979_tags: bool,
known_signal: int, vehicle_number: str):
self.make_and_move_to_vehicle_directory()
plot_known_signal_cluster(a_timer=a_timer,
cluster_dict=cluster_dictionary,
signal_dict=signal_dictionary,
use_j1979_tags=use_j1979_tags,
vehicle_number=vehicle_number,
given_arb_id=known_signal,
force=force_cluster_plotting)
self.move_back_to_parent_directory()

4
Pipeline_multi-file/SemanticAnalysis.py Normal file → Executable file

@ -1,5 +1,5 @@
from pandas import concat, DataFrame, read_csv
from numpy import ndarray, zeros
from numpy import ndarray, zeros, clip
from os import path, remove
from pickle import load, dump
from ast import literal_eval
@ -77,7 +77,7 @@ def signal_clustering(corr_matrix: DataFrame,
corr_matrix.where(corr_matrix > 0, 0, inplace=True)
corr_matrix = 1 - corr_matrix
X = corr_matrix.values # type: ndarray
Y = ssd.squareform(X)
Y = clip(ssd.squareform(X), 0, None)
# Z is the linkage matrix. This can serve as input to the scipy.cluster.hierarchy.dendrogram method
Z = linkage(Y, method='single', optimal_ordering=True)
fclus = fcluster(Z, t=threshold, criterion='distance')
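
The clip added here guards against tiny negative distances (floating-point error after computing 1 - correlation) that scipy's linkage step rejects. A minimal sketch of the full clustering path, assuming corr is a symmetric signal-correlation DataFrame:

import numpy as np
import scipy.spatial.distance as ssd
from scipy.cluster.hierarchy import linkage, fcluster

def cluster_from_correlation(corr, threshold: float = 0.10):
    corr = corr.where(corr > 0, 0)          # drop negative correlations
    dist = 1 - corr                         # turn similarity into distance
    condensed = np.clip(ssd.squareform(dist.values), 0, None)
    Z = linkage(condensed, method='single', optimal_ordering=True)
    return fcluster(Z, t=threshold, criterion='distance')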

0
Pipeline_multi-file/Signal.py Normal file → Executable file

0
Pipeline_multi-file/Validator.py Normal file → Executable file

0
Pipeline_multi-file/maximize_sum_shannon.py Normal file → Executable file