import marimo

__generated_with = "0.10.17"
app = marimo.App(width="medium")


@app.cell(hide_code=True)
def header():
    """Load project settings plus shared header/intro helpers and render the title."""
    import marimo as mo

    notebook_name = 'prepare_perfspec.py'

    from lib_perfspec import perfspec_vars
    (_, _defs) = perfspec_vars.run()
    perfspec = _defs['perfspec']

    from lib_perfspec import perfspec_header
    (_, _defs) = perfspec_header.run()
    lib_header = _defs['header']
    lib_intro = _defs['intro']

    mo.md(
        f"""
        {lib_header(notebook_name)}

        ## Prepare data to train **{perfspec['app']['train_mode']}** model
        """
    )
    return (
        lib_header,
        lib_intro,
        mo,
        notebook_name,
        perfspec,
        perfspec_header,
        perfspec_vars,
    )


@app.cell(hide_code=True)
def imports():
    from pathlib import Path
    import copy  # required by the plot_actions_distrib cell (copy.deepcopy)
    import json
    import numpy as np
    import pandas as pd
    return Path, copy, json, np, pd


@app.cell(hide_code=True)
def intro(Path, lib_intro, mo, notebook_name, perfspec):
    """Parse CLI-style args into perfspec and verify the data directory exists."""
    verbose = perfspec['settings']['verbose']
    perfspec['vars'] = {}
    from lib_perfspec import perfspec_args
    (_, _defs) = perfspec_args.run()
    if not Path(perfspec['defaults']['data_dirpath']).exists():
        exit(f"data dir path not found: {perfspec['defaults']['data_dirpath']}")
    mo.md(
        f"""
        {lib_intro(notebook_name)}
        """
    )
    return perfspec_args, verbose


@app.cell(hide_code=True)
def load_raw_logs(Path, mo, pd, perfspec):
    def load_raw_logs(filepath):
        """Read a JSON-lines audit log in chunks, flatten it, and stash it in perfspec."""
        file_path = Path(filepath)
        if not file_path.exists():
            exit(f"File not found: {filepath}")
        # Chunked read keeps peak memory bounded on large logs;
        # adjust chunk_size based on available memory.
        chunk_size = 1000
        chunks = []
        try:
            for chunk in pd.read_json(
                file_path,
                lines=True,
                chunksize=chunk_size,
                encoding_errors='ignore',
            ):
                chunks.append(chunk)
        except ValueError as e:
            print(f"Error while parsing JSON: {e}")
        if not chunks:
            # pd.concat([]) would raise; fail with a clear message instead
            exit(f"No data could be read from: {filepath}")
        df = pd.concat(chunks, ignore_index=True)
        # Flatten nested JSON objects into columns joined with '_'
        # (e.g. user.username -> user_username)
        df_normalized = pd.json_normalize(
            df.to_dict(orient='records'),
            sep='_',
        )
        if perfspec['settings']['verbose'] is not None or mo.running_in_notebook():
            print(f"Loaded {len(df_normalized)} rows from raw logs ")
        perfspec['vars']['df_raw_data'] = df_normalized

    load_raw_logs(perfspec['settings']['raw_audit_log'])
    mo.md(
        f"""
        ## {mo.icon('lucide:database', color="green")} Load raw logs into a Dataset

        Loading raw data logs from: {perfspec['settings']['raw_audit_log']}
        """
    )
    return (load_raw_logs,)


@app.cell(hide_code=True)
def create_main_audit_logs(Path, mo, perfspec):
    def create_main_audit_logs(df_normalized, outputfile):
        """Drop noisy columns, keep the selected audit fields, and write JSON lines."""
        # Fields dropped when present in the flattened frame
        remove_fields = [
            "apiVersion",
            "level",
            "sourceIPs",
            "kind",
            "annotations",
            "stageTimestamp",
            "userAgent",
        ]
        # Fields kept in the main audit log (order defines the output column order)
        keep_fields = [
            "requestReceivedTimestamp",
            "user_username",
            "verb",
            "objectRef_resource",
            "objectRef_subresource",
            "objectRef_name",
            "requestURI",
            "auditID",
            "stage",
            "responseStatus_code",
            "objectRef_uid",
        ]
        df_cleaned = df_normalized.drop(
            columns=[field for field in remove_fields if field in df_normalized.columns],
            errors='ignore',
        )
        df_final = df_cleaned[keep_fields].copy()
        output_file = Path(outputfile)
        if output_file.exists():
            output_file.unlink()
        df_final.to_json(output_file, orient='records', lines=True, force_ascii=False)
        if perfspec['settings']['verbose'] is not None or mo.running_in_notebook():
            print(f"Main audit log created in {output_file}")

    create_main_audit_logs(
        perfspec['vars']['df_raw_data'],
        perfspec['settings']['main_audit_log'],
    )
    mo.md(
        f"""
        ## {mo.icon('lucide:scroll', color="red")} Cleanup Dataset

        Create a main **audit log** as starting point for operations

        Final log will be in {perfspec['settings']['main_audit_log']}
        """
    )
    return (create_main_audit_logs,)


@app.cell(hide_code=True)
def generate_actions_data(Path, mo, pd, perfspec):
    def generate_actions_data(filepath):
        """Load the main audit log, filter read-only/noisy events, derive event_type."""
        file_path = Path(filepath)
        if not file_path.exists():
            exit(f"File path: {filepath} not exists")
        df_actions = pd.read_json(file_path, orient='records', lines=True)
        # Column order mirrors keep_fields in create_main_audit_logs
        names = [
            'requestReceivedTimestamp',
            'user_username',
            'verb',
            'objectRef_resource',
            'objectRef_subresource',
            'objectRef_name',
            'requestURI',
            'auditID',
            'stage',
            'responseStatus_code',
            "objectRef_uid",
        ]
        df_actions.columns = names
        df_actions = df_actions.drop_duplicates(ignore_index=True)
        # Keep only state-changing events: drop read verbs and noisy resources
        df_actions = df_actions[
            (df_actions['verb'] != "get")
            & (df_actions['verb'] != "watch")
            & (df_actions['verb'] != "list")
            & (df_actions['objectRef_resource'] != "events")
            & (df_actions['objectRef_resource'] != "leases")
        ]
        df_actions = df_actions[df_actions.objectRef_resource != "replicationcontrollers"]
        # event_type is the training token: "<verb>_<resource>"
        df_actions["event_type"] = (
            df_actions["verb"] + "_" + df_actions["objectRef_resource"]
        )
        perfspec['vars']['df_actions_dataset'] = df_actions
        if perfspec['settings']['verbose'] is not None or mo.running_in_notebook():
            print(f"Main audit log prepared for actions data with: {len(df_actions)} rows")

    generate_actions_data(perfspec['settings']['main_audit_log'])
    mo.md(
        f"""
        ## {mo.icon('lucide:database', color="green")} Load audit logs for Actions

        Loading **main audit data logs** from: {perfspec['settings']['main_audit_log']}
        """
    )
    return (generate_actions_data,)


@app.cell(hide_code=True)
def save_actions_data(Path, mo, perfspec):
    def save_actions_data(df_audit_logs, filepath):
        """Write one event_type per line; never overwrite an existing file."""
        file_path = Path(filepath)
        if file_path.exists():
            # Existing file is treated as a trained-model data source: keep it
            print(f"File path already exists: {filepath} DELETE to process")
            return
        # Open once and stream every event_type, one per line
        with open(file_path, "w") as event_file:
            for event_type in df_audit_logs['event_type']:
                event_file.write(f"{event_type}\n")

    save_actions_data(
        perfspec['vars']['df_actions_dataset'],
        perfspec['settings']['actions_filepath'],
    )
    mo.md(
        f"""
        ### {mo.icon('lucide:save', color="green")} Save Actions Data

        Save **actions data** logs in: {perfspec['settings']['actions_filepath']}

        > If file exists it will not be deleted or overwritten, basically as a **trained models data source**
        """
    )
    return (save_actions_data,)


@app.cell(hide_code=True)
def plot_available_fonts(mo, perfspec):
    def available_fonts():
        """Return every font family name Matplotlib can use on this system."""
        import matplotlib.font_manager
        return [f.name for f in matplotlib.font_manager.fontManager.ttflist]

    # Only shown in 'dev' verbosity to avoid cluttering the notebook
    if perfspec['settings']['verbose'] == 'dev':
        mo.md(
            f"""
            ### Matplot available fonts

            Fonts: {available_fonts()}
            """
        )
    return (available_fonts,)


@app.cell
def main(mo):
    mo.md("""""")
    return


@app.cell(hide_code=True)
def actions_distrib(mo, perfspec):
    mo.md(
        f"""
        # {mo.icon('lucide:chart-spline', color="orange")} Plot actions distribution

        Show how **Resources** and critical **methods** are concentrated or distributed

        A distribution map is generated with plot graphic and saved from **actions-dataset**
        to {perfspec['settings']['actions_distrib_filepath']}
        using format {perfspec['settings']['actions_distrib_format']}
        """
    )
    return


@app.cell(hide_code=True)
def plot_actions_distrib(copy, mo, np, perfspec):
    def actions_distrib(filename, output_path, output_format):
        """Plot a verb/resource bubble chart of action counts and save it to disk.

        Reads the actions file (one "<verb>_<resource>" token per line),
        counts occurrences, and draws a scatter where bubble area and the
        annotated number encode the count. The Counter is stored in
        perfspec['vars']['actions_distrib'] for later cells.
        """
        from collections import Counter
        import matplotlib
        import matplotlib.pyplot as plt
        import textwrap

        font_size = 17
        # Use a font known to be available on most systems
        matplotlib.rcParams['font.family'] = 'DejaVu Serif'

        with open(filename, 'r', encoding='utf-8') as actions_file:
            raw_text = actions_file.read()
        # Each line holds one token; trailing newline leaves an empty tail -> drop it
        raw_words = raw_text.replace('\n', ',').split(',')[:-1]

        def label_wrap(labels):
            # Wrap long tick labels onto multiple lines (helper for manual tuning)
            work_labels = copy.deepcopy(labels)
            for i, label in enumerate(work_labels):
                work_labels[i] = "\n".join(textwrap.wrap(label, width=15))
            return work_labels

        # Filter out unwanted events (e.g. deletecollection)
        words = [event for event in raw_words if 'collection' not in event]

        verbs = []
        resources = []
        counter = Counter(words)
        for word in words:
            verb, resource = word.split('_')[0], word.split('_')[1]
            verbs.append(verb)
            resources.append(resource)

        verbs_set = sorted(set(verbs))
        resources_set = sorted(set(resources), reverse=True)

        # Cartesian product of verbs x resources builds the full scatter grid
        verbs = [e for e in verbs_set for k in resources_set]
        resources = [e for k in verbs_set for e in resources_set]

        # Bubble area scales with the count; texts hold the raw counts
        area_resource = [
            3 * counter[verbs[i] + "_" + resources[i]] for i in range(len(verbs))
        ]
        texts = [counter[verbs[i] + "_" + resources[i]] for i in range(len(verbs))]

        plt.rcParams.update({'font.size': font_size})
        fig = plt.figure(figsize=(9, 9), dpi=100)
        ax = fig.add_subplot(111, label="1")
        ax.scatter(verbs, resources, s=area_resource, alpha=0.4, color='gray')
        ax.set_xlabel("Methods", fontsize=font_size)
        ax.set_xticks(verbs_set)
        ax.set_xticklabels(verbs_set, fontsize=font_size - 4, linespacing=1.0)
        ax.xaxis.tick_bottom()
        ax.set_xlim(-0.5, 3.5)
        ax.set_ylabel("Resources", fontsize=font_size)
        ax.set_yticks(resources_set)
        ax.set_yticklabels(resources_set, fontsize=font_size - 8, linespacing=1.0)
        ax.yaxis.tick_left()
        # Annotate counts above a threshold, scaling the font with the count
        for j in range(len(verbs)):
            if texts[j] > 20:
                plt.annotate(
                    str(texts[j]),
                    (verbs[j], resources[j]),
                    ha='center',
                    va='center',
                    fontsize=np.interp(texts[j], [20, 2234], [8, 20]),
                )
        plt.rcParams['grid.linestyle'] = 'dotted'
        plt.rcParams['grid.alpha'] = 0.3
        plt.grid()
        fig.savefig(output_path, format=output_format, bbox_inches="tight")
        perfspec['vars']['actions_distrib'] = counter
        return plt

    _plt = actions_distrib(
        perfspec['settings']['actions_filepath'],
        perfspec['settings']['actions_distrib_filepath'],
        perfspec['settings']['actions_distrib_format'],
    )
    if 'actions_distrib' in perfspec['vars'] and len(perfspec['vars']['actions_distrib'].items()) > 0:
        mo.md(
            f"""
            ## Plot actions distribution

            {mo.as_html(_plt.show())}
            """
        )
    return (actions_distrib,)


@app.cell(hide_code=True)
def review_actions_distrib(mo, pd, perfspec):
    def df_actions_table():
        """Build an interactive table of action counts, or None when no data."""
        if perfspec['vars'].get('actions_distrib'):
            df = pd.DataFrame(
                perfspec['vars']['actions_distrib'].items(),
                columns=['Action', 'Count'],
            )
            transform_df = mo.ui.table(df, selection="multi")
            return transform_df
        else:
            return None

    transform_df = df_actions_table()
    mo.md(
        f"""
        ## Review actions distribution
        {transform_df}
        """
    )
    return df_actions_table, transform_df


@app.cell(hide_code=True)
def select_actions_distrib(mo, transform_df):
    # transform_df is None when no distribution data was produced upstream
    if transform_df is not None:
        _selection = mo.md(f"Selected value: {mo.ui.table(transform_df.value)}")
    else:
        _selection = ""
    mo.md(
        f"""
        ## Select actions distribution
        {_selection}
        """
    )
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()