import marimo

__generated_with = "0.10.17"
app = marimo.App(width="medium")


@app.cell(hide_code=True)
def header():
    import marimo as mo

    notebook_name = 'prepare_perfspec.py'

    from lib_perfspec import perfspec_vars
    (_, _defs) = perfspec_vars.run()
    perfspec = _defs['perfspec']

    from lib_perfspec import perfspec_header
    (_, _defs) = perfspec_header.run()
    lib_header = _defs['header']
    lib_intro = _defs['intro']

    mo.md(
        f"""
        {lib_header(notebook_name)}

        ## Prepare data to train **{perfspec['app']['train_mode']}** model
        """
    )
    return (
        lib_header,
        lib_intro,
        mo,
        notebook_name,
        perfspec,
        perfspec_header,
        perfspec_vars,
    )


@app.cell(hide_code=True)
def imports():
    from pathlib import Path
    import numpy as np
    import json
    import pandas as pd
    return Path, json, np, pd


@app.cell(hide_code=True)
def intro(Path, lib_intro, mo, notebook_name, perfspec):
    verbose = perfspec['settings']['verbose']
    perfspec['vars'] = {}

    from lib_perfspec import perfspec_args
    (_, _defs) = perfspec_args.run()

    if not Path(perfspec['defaults']['data_dirpath']).exists():
        exit(f"data dir path not found: {perfspec['defaults']['data_dirpath']}")

    mo.md(
        f"""
        {lib_intro(notebook_name)}
        """
    )
    return perfspec_args, verbose


@app.cell(hide_code=True)
def load_raw_logs(Path, mo, pd, perfspec):
    def load_raw_logs(filepath):
        file_path = Path(filepath)

        # Check if the file exists using Path
        if not file_path.exists():
            exit(f"File not found: {filepath}")

        # Set the chunk size (number of rows to process at a time)
        chunk_size = 1000  # Adjust based on your available memory

        # Create an empty list to hold the chunks
        chunks = []

        # Iterate over the file in chunks
        try:
            for chunk in pd.read_json(file_path, lines=True, chunksize=chunk_size, encoding_errors='ignore'):
                # Append each chunk to the list
                chunks.append(chunk)
        except ValueError as e:
            print(f"Error while parsing JSON: {e}")

        # Combine all chunks into a single DataFrame
        df = pd.concat(chunks, ignore_index=True)

        #df['user_parsed'] = df['user'].apply(json.loads)
        #df_exploded = df.explode('user')

        # Normalize the JSON structure to flatten it
        df_normalized = pd.json_normalize(
            df.to_dict(orient='records'),  # Convert the DataFrame to a list of records
            sep='_'
        )
        if perfspec['settings']['verbose'] is not None or mo.running_in_notebook():
            print(f"Loaded {len(df_normalized)} rows from raw logs")
        perfspec['vars']['df_raw_data'] = df_normalized

    load_raw_logs(perfspec['settings']['raw_audit_log'])

    mo.md(
        f"""
        ## {mo.icon('lucide:database', color="green")} Load raw logs into a Dataset

        Loading raw data logs from: {perfspec['settings']['raw_audit_log']}
        """
    )
    return (load_raw_logs,)


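# The cell above streams the raw audit log (JSON Lines) through pandas in chunks and
# then flattens nested records with pd.json_normalize. A minimal, hedged sketch of the
# same pattern outside marimo is shown below (kept commented so this file stays a pure
# notebook; the "audit.log" file name is hypothetical):
#
#   import pandas as pd
#   chunks = []
#   for chunk in pd.read_json("audit.log", lines=True, chunksize=1000):
#       chunks.append(chunk)
#   df = pd.concat(chunks, ignore_index=True)
#   flat = pd.json_normalize(df.to_dict(orient="records"), sep="_")

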
@app.cell(hide_code=True)
def create_main_audit_logs(Path, mo, perfspec):
    def create_main_audit_logs(df_normalized, outputfile):
        # List of fields to remove
        remove_fields = [
            "apiVersion", "level", "sourceIPs", "kind",
            "annotations", "stageTimestamp", "userAgent"
        ]

        # List of fields to keep
        keep_fields = [
            "requestReceivedTimestamp", "user_username", "verb",
            "objectRef_resource", "objectRef_subresource", "objectRef_name",
            "requestURI", "auditID", "stage",
            "responseStatus_code", "objectRef_uid",
        ]

        # Remove unwanted fields (drop them from the DataFrame if they exist)
        df_cleaned = df_normalized.drop(columns=[field for field in remove_fields if field in df_normalized.columns], errors='ignore')

        # Select only the fields to keep (they must exist in the DataFrame)
        df_final = df_cleaned[keep_fields].copy()

        # Display the final DataFrame
        #print("Final DataFrame with only the selected fields:")
        #print(df_final.head())

        # Define the output path for the JSON file
        output_file = Path(outputfile)

        if output_file.exists():
            output_file.unlink()

        # Write the DataFrame to JSON
        df_final.to_json(output_file, orient='records', lines=True, force_ascii=False)
        if perfspec['settings']['verbose'] is not None or mo.running_in_notebook():
            print(f"Main audit log created in {output_file}")

    create_main_audit_logs(perfspec['vars']['df_raw_data'], perfspec['settings']['main_audit_log'])

    mo.md(
        f"""
        ## {mo.icon('lucide:scroll', color="red")} Cleanup Dataset

        Create a main **audit log** as a starting point for further operations

        The final log will be written to {perfspec['settings']['main_audit_log']}
        """
    )
    return (create_main_audit_logs,)


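# The cleanup cell keeps a fixed whitelist of columns and writes the result back as
# JSON Lines so later cells can re-read it record by record. A hedged sketch of that
# write/read round trip (not executed here; the "main_audit.log" path is hypothetical):
#
#   df_final.to_json("main_audit.log", orient="records", lines=True, force_ascii=False)
#   df_back = pd.read_json("main_audit.log", orient="records", lines=True)

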
@app.cell(hide_code=True)
def generate_actions_data(Path, mo, pd, perfspec):
    def generate_actions_data(filepath):
        file_path = Path(filepath)
        if not file_path.exists():
            exit(f"File path not found: {filepath}")

        df_actions = pd.read_json(file_path, orient='records', lines=True)

        names = ['requestReceivedTimestamp', 'user_username', 'verb',
                 'objectRef_resource', 'objectRef_subresource', 'objectRef_name',
                 'requestURI', 'auditID', 'stage', 'responseStatus_code', 'objectRef_uid']

        # Assign the column names to the DataFrame
        df_actions.columns = names

        #print(df_actions.count)
        df_actions = df_actions.drop_duplicates(ignore_index=True)

        #df_actions = df_actions.drop(df_actions.columns[[]], axis=1)
        # Filter out read-only verbs and noisy resources
        df_actions = df_actions[
            (df_actions['verb'] != "get") &
            (df_actions['verb'] != "watch") &
            (df_actions['verb'] != "list") &
            (df_actions['objectRef_resource'] != "events") &
            (df_actions['objectRef_resource'] != "leases")
        ]

        df_actions = df_actions[df_actions.objectRef_resource != "replicationcontrollers"]

        df_actions["event_type"] = df_actions["verb"] + "_" + df_actions["objectRef_resource"]

        #df_actions = df_actions.drop_duplicates()
        #print(df_actions.to_string())
        #print(df_actions.count)

        perfspec['vars']['df_actions_dataset'] = df_actions
        if perfspec['settings']['verbose'] is not None or mo.running_in_notebook():
            print(f"Main audit log prepared for actions data with {len(df_actions)} rows")

    generate_actions_data(perfspec['settings']['main_audit_log'])
    mo.md(
        f"""
        ## {mo.icon('lucide:database', color="green")} Load audit logs for Actions

        Loading **main audit data logs** from: {perfspec['settings']['main_audit_log']}
        """
    )
    return (generate_actions_data,)


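# Each remaining audit record is encoded as a single action token of the form
# "<verb>_<resource>", after dropping read-only verbs and noisy resources. A hedged
# sketch of the encoding step alone, on a small hypothetical DataFrame:
#
#   import pandas as pd
#   df = pd.DataFrame({"verb": ["create", "get"], "objectRef_resource": ["pods", "pods"]})
#   df = df[~df["verb"].isin(["get", "watch", "list"])]
#   df["event_type"] = df["verb"] + "_" + df["objectRef_resource"]  # -> "create_pods"

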
@app.cell(hide_code=True)
def save_actions_data(Path, mo, perfspec):
    def save_actions_data(df_audit_logs, filepath):
        file_path = Path(filepath)
        if file_path.exists():
            #file_path.unlink()
            print(f"File path already exists: {filepath}. DELETE it to process again")
            return

        event_seq = []
        event_sub_seq = []

        audit_filepath = Path(filepath)
        if audit_filepath.exists():
            audit_filepath.unlink()
        # Write one event_type per line
        with open(audit_filepath, "a") as event_file:
            for c, r in df_audit_logs.iterrows():
                event_file.write("%s\n" % r['event_type'])

    save_actions_data(perfspec['vars']['df_actions_dataset'], perfspec['settings']['actions_filepath'])
    mo.md(
        f"""
        ### {mo.icon('lucide:save', color="green")} Save Actions Data

        Save **actions data** logs in: {perfspec['settings']['actions_filepath']}

        > If the file already exists it will not be deleted or overwritten, since it is kept as a **trained models data source**
        """
    )
    return (save_actions_data,)


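# The actions file written above is plain text with one "<verb>_<resource>" token per
# line, which is what later training steps consume as a sequence. A hedged sketch of
# reading it back (the "actions.txt" path is hypothetical):
#
#   with open("actions.txt", encoding="utf-8") as f:
#       actions = [line.strip() for line in f if line.strip()]

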
@app.cell(hide_code=True)
def plot_available_fonts(mo, perfspec):
    def available_fonts():
        import matplotlib.font_manager

        # List all available fonts in Matplotlib
        available_fonts = [f.name for f in matplotlib.font_manager.fontManager.ttflist]
        return (available_fonts)

    if perfspec['settings']['verbose'] == 'dev':
        mo.md(
            f"""
            ### Matplotlib available fonts

            Fonts: {available_fonts()}
            """
        )
    return (available_fonts,)


@app.cell
def main(mo):
    mo.md("""<a id='main' />""")
    return


@app.cell(hide_code=True)
def actions_distrib(mo, perfspec):
    mo.md(
        f"""
        # {mo.icon('lucide:chart-spline', color="orange")} Plot actions distribution

        Show how **Resources** and critical **methods** are concentrated or distributed

        A distribution map is plotted and saved from the **actions dataset**<br>
        to {perfspec['settings']['actions_distrib_filepath']} <br>
        using the {perfspec['settings']['actions_distrib_format']} format <br>
        """
    )
    return


@app.cell(hide_code=True)
def plot_actions_distrib(mo, np, perfspec):
    def actions_distrib(filename, output_path, output_format):
        import re
        import copy  # needed by label_wrap below
        from collections import defaultdict, Counter
        from tqdm import trange, tqdm
        import time
        import matplotlib.pyplot as plt
        import matplotlib
        import textwrap

        font_size = 17

        # Set the font to a specific one that is known to be available on the system
        matplotlib.rcParams['font.family'] = 'DejaVu Serif'

        raw_text = open(filename, 'r', encoding='utf-8').read()
        raw_words = raw_text.replace('\n', ',').split(',')[:-1]

        def label_wrap(labels):
            work_labels = copy.deepcopy(labels)
            for i, label in enumerate(work_labels):
                work_labels[i] = "\n".join(textwrap.wrap(label, width=15))
            return work_labels

        words = []
        event_filter = []
        # Example of filtering out some unwanted events
        for event in raw_words:
            if not ('collection' in event):
                words.append(event)

        verbs = []
        resources = []

        counter = Counter(words)

        for word in words:
            verb, resource = word.split('_')[0], word.split('_')[1]
            verbs.append(verb)
            resources.append(resource)

        # verbs = verbs[:200]
        # resources = resources[:200]

        counter_verbs = Counter(verbs)
        counter_resources = Counter(resources)

        verbs_set = list(set(verbs))
        resources_set = list(set(resources))

        verbs_set.sort()
        resources_set.sort(reverse=True)

        verbs = [e for e in verbs_set for k in resources_set]
        resources = [e for k in verbs_set for e in resources_set]

        # color_verb = [counter[e+"_"+k] for e in verbs_set for k in resources_set]
        area_resource = [3*counter[verbs[i]+"_"+resources[i]] for i in range(len(verbs))]
        texts = [counter[verbs[i]+"_"+resources[i]] for i in range(len(verbs))]

        plt.rcParams.update({'font.size': font_size})

        fig = plt.figure(figsize=(9, 9), dpi=100)

        ax = fig.add_subplot(111, label="1")

        ax.scatter(verbs, resources, s=area_resource, alpha=0.4, color='gray')

        ax.set_xlabel("Methods", fontsize=font_size)
        ax.set_xticks(verbs_set)
        ax.set_xticklabels(verbs_set, fontsize=font_size-4, linespacing=1.0)
        ax.xaxis.tick_bottom()
        ax.set_xlim(-0.5, 3.5)

        ax.set_ylabel("Resources", fontsize=font_size)
        ax.set_yticks(resources_set)
        ax.set_yticklabels(resources_set, fontsize=font_size-8, linespacing=1.0)
        ax.yaxis.tick_left()

        for j in range(len(verbs)):
            if texts[j] > 20:
                plt.annotate(str(texts[j]), (verbs[j], resources[j]), ha='center', va='center', fontsize=np.interp(texts[j], [20, 2234], [8, 20]))

        plt.rcParams['grid.linestyle'] = 'dotted'
        plt.rcParams['grid.alpha'] = 0.3

        plt.grid()
        # plt.show()
        fig.savefig(output_path, format=output_format, bbox_inches="tight")
        perfspec['vars']['actions_distrib'] = counter
        return plt

    _plt = actions_distrib(
        perfspec['settings']['actions_filepath'],
        perfspec['settings']['actions_distrib_filepath'],
        perfspec['settings']['actions_distrib_format']
    )

    if 'actions_distrib' in perfspec['vars'] and len(perfspec['vars']['actions_distrib'].items()) > 0:
        mo.md(
            f"""
            ## Plot actions distribution

            {mo.as_html(_plt.gcf())}
            """
        )
    return (actions_distrib,)


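# The plotting cell counts each "<verb>_<resource>" token with collections.Counter and
# draws a bubble scatter: x = verb, y = resource, marker area and annotation = frequency.
# A hedged sketch of the counting step on a hypothetical token list:
#
#   from collections import Counter
#   counter = Counter(["create_pods", "create_pods", "delete_secrets"])
#   # counter["create_pods"] == 2

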
@app.cell(hide_code=True)
def review_actions_distrib(mo, pd, perfspec):
    def df_actions_table():
        if len(perfspec['vars']['actions_distrib'].items()) > 0:
            df = pd.DataFrame(perfspec['vars']['actions_distrib'].items(), columns=['Action', 'Count'])
            count_filter = mo.ui.slider(start=0, stop=100, value=50, label="Max age")
            count_filter
            #_transform_df = mo.ui.dataframe(_df)
            transform_df = mo.ui.table(df, selection="multi")
            return transform_df
        else:
            return None

    transform_df = df_actions_table()
    mo.md(
        f"""
        ## Review actions distribution

        {transform_df}
        """
    )
    return df_actions_table, transform_df


@app.cell(hide_code=True)
def select_actions_distrib(mo, transform_df):
    mo.md(
        f"""
        ## Select actions distribution

        {mo.md(f"Selected value: {mo.ui.table(transform_df.value)}")}
        """
    )
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()