# prepare_perfspec.py — marimo notebook: prepare Kubernetes audit-log data for training.
# (Extracted from patch 62e0f1f9 "chore: add prepare data notebook marimo".)
import marimo

__generated_with = "0.10.16"
app = marimo.App(width="medium")


@app.cell(hide_code=True)
def header():
    """Load shared perfspec helpers and render the notebook header."""
    import marimo as mo

    notebook_name = 'prepare_perfspec.py'

    from lib_perfspec import perfspec_vars
    (_, _defs) = perfspec_vars.run()
    perfspec = _defs['perfspec']

    from lib_perfspec import perfspec_header
    (_, _defs) = perfspec_header.run()
    lib_header = _defs['header']
    lib_intro = _defs['intro']

    mo.md(
        f"""
        {lib_header(notebook_name)}

        ## Prepare data to train **{perfspec['app']['train_mode']}** model
        """
    )
    return (
        lib_header,
        lib_intro,
        mo,
        notebook_name,
        perfspec,
        perfspec_header,
        perfspec_vars,
    )


@app.cell
def imports():
    # `copy` is consumed by the actions-distribution plot cell (label_wrap);
    # without importing (and returning) it here, that cell fails with an
    # undefined-name error at runtime.
    import copy
    import json
    from pathlib import Path

    import numpy as np
    import pandas as pd
    return Path, copy, json, np, pd


@app.cell(hide_code=True)
def intro(Path, lib_intro, mo, notebook_name, perfspec):
    """Parse CLI/notebook arguments and validate that the data directory exists."""
    verbose = perfspec['settings']['verbose']
    perfspec['vars'] = {}

    from lib_perfspec import perfspec_args
    (_, _defs) = perfspec_args.run()

    if not Path(perfspec['defaults']['data_dirpath']).exists():
        # Abort the whole run: nothing below can work without the data dir.
        raise SystemExit(f"data dir path not found: {perfspec['defaults']['data_dirpath']}")

    mo.md(
        f"""
        {lib_intro(notebook_name)}

        """
    )
    return perfspec_args, verbose


@app.cell(hide_code=True)
def load_raw_logs(Path, mo, pd, perfspec):
    def load_raw_logs(filepath):
        """Load JSON-lines audit logs in chunks and store the flattened frame.

        The normalized DataFrame is published as perfspec['vars']['df_raw_data'].
        """
        file_path = Path(filepath)

        if not file_path.exists():
            raise SystemExit(f"File not found: {filepath}")

        # Chunked read keeps peak memory bounded on large logs; adjust the
        # chunk size to the available memory.
        chunk_size = 1000
        chunks = []
        try:
            for chunk in pd.read_json(file_path, lines=True, chunksize=chunk_size, encoding_errors='ignore'):
                chunks.append(chunk)
        except ValueError as e:
            # Best-effort: keep whatever chunks parsed before the error.
            print(f"Error while parsing JSON: {e}")

        # NOTE(review): if every chunk failed to parse, `chunks` is empty and
        # pd.concat raises ValueError — confirm whether that should be handled.
        df = pd.concat(chunks, ignore_index=True)

        # Flatten nested JSON records into columns joined with '_'
        # (e.g. user.username -> user_username).
        df_normalized = pd.json_normalize(
            df.to_dict(orient='records'),
            sep='_'
        )
        if perfspec['settings']['verbose'] is not None or mo.running_in_notebook():
            print(f"Loaded {len(df_normalized)} rows from raw logs ")
        perfspec['vars']['df_raw_data'] = df_normalized

    load_raw_logs(perfspec['settings']['raw_audit_log'])

    mo.md(
        f"""
        ## {mo.icon('lucide:database', color="green")} Load raw logs into a Dataset

        Loading raw data logs from: {perfspec['settings']['raw_audit_log']}
        """
    )
    return (load_raw_logs,)


@app.cell(hide_code=True)
def create_main_audit_logs(Path, mo, perfspec):
    def create_main_audit_logs(df_normalized, outputfile):
        """Drop noise columns, keep the audit fields of interest, write JSON lines."""
        remove_fields = [
            "apiVersion", "level", "sourceIPs", "kind",
            "annotations", "stageTimestamp", "userAgent"
        ]

        keep_fields = [
            "requestReceivedTimestamp", "user_username", "verb",
            "objectRef_resource", "objectRef_subresource", "objectRef_name",
            "requestURI", "auditID", "stage",
            "responseStatus_code", "objectRef_uid",
        ]

        df_cleaned = df_normalized.drop(
            columns=[field for field in remove_fields if field in df_normalized.columns],
            errors='ignore',
        )

        # NOTE(review): this raises KeyError if any keep_field is absent from
        # the raw logs — assumes every field is always present; confirm.
        df_final = df_cleaned[keep_fields].copy()

        output_file = Path(outputfile)
        if output_file.exists():
            output_file.unlink()

        df_final.to_json(output_file, orient='records', lines=True, force_ascii=False)
        if perfspec['settings']['verbose'] is not None or mo.running_in_notebook():
            print(f"Main audit log created in {output_file}")

    create_main_audit_logs(perfspec['vars']['df_raw_data'], perfspec['settings']['main_audit_log'])

    mo.md(
        f"""
        ## {mo.icon('lucide:scroll', color="red")} Cleanup Dataset

        Create a main **audit log** as starting point for operations

        Final log will be in {perfspec['settings']['main_audit_log']}
        """
    )
    return (create_main_audit_logs,)


@app.cell(hide_code=True)
def generate_actions_data(Path, mo, pd, perfspec):
    def generate_actions_data(filepath):
        """Filter the main audit log down to meaningful actions.

        Read-only verbs (get/watch/list) and noisy resources (events, leases,
        replicationcontrollers) are removed; each remaining row gets an
        `event_type` of the form "<verb>_<resource>". The result is published
        as perfspec['vars']['df_actions_dataset'].
        """
        file_path = Path(filepath)
        if not file_path.exists():
            raise SystemExit(f"File path: {filepath} does not exist")

        df_actions = pd.read_json(file_path, orient='records', lines=True)

        # Column order must match what create_main_audit_logs wrote.
        names = ['requestReceivedTimestamp', 'user_username', 'verb',
                 'objectRef_resource', 'objectRef_subresource', 'objectRef_name',
                 'requestURI', 'auditID', 'stage', 'responseStatus_code', "objectRef_uid"]
        df_actions.columns = names

        df_actions = df_actions.drop_duplicates(ignore_index=True)

        # Keep only state-changing verbs on non-noise resources.
        df_actions = df_actions[
            (df_actions['verb'] != "get") &
            (df_actions['verb'] != "watch") &
            (df_actions['verb'] != "list") &
            (df_actions['objectRef_resource'] != "events") &
            (df_actions['objectRef_resource'] != "leases")
        ]
        df_actions = df_actions[df_actions.objectRef_resource != "replicationcontrollers"]

        df_actions["event_type"] = df_actions["verb"] + "_" + df_actions["objectRef_resource"]

        perfspec['vars']['df_actions_dataset'] = df_actions
        if perfspec['settings']['verbose'] is not None or mo.running_in_notebook():
            print(f"Main audit log prepared for actions data with: {df_actions.count()} rows")

    generate_actions_data(perfspec['settings']['main_audit_log'])
    mo.md(
        f"""
        ## {mo.icon('lucide:database', color="green")} Load audit logs for Actions

        Loading **main audit data logs** from: {perfspec['settings']['main_audit_log']}
        """
    )
    return (generate_actions_data,)


@app.cell(hide_code=True)
def save_actions_data(Path, mo, perfspec):
    def save_actions_data(df_audit_logs, filepath):
        """Write one `event_type` per line to *filepath*.

        An existing file is treated as a trained model's data source and is
        never overwritten — the function just reports and returns.
        """
        file_path = Path(filepath)
        if file_path.exists():
            print(f"File path already exist: {filepath} DELETE to process")
            return

        # Open once and stream every row, instead of re-opening the file
        # in append mode for each DataFrame row.
        with open(file_path, "w") as event_file:
            for _, row in df_audit_logs.iterrows():
                event_file.write("%s\n" % row['event_type'])

    save_actions_data(perfspec['vars']['df_actions_dataset'], perfspec['settings']['actions_filepath'])
    mo.md(
        f"""
        ### {mo.icon('lucide:save', color="green")} Save Actions Data

        Save **actions data** logs in: {perfspec['settings']['actions_filepath']}

        > If file exists it will not be deleted or overwritten, basically as a **trained models data source**

        """
    )
    return (save_actions_data,)


@app.cell(hide_code=True)
def plot_available_fonts(mo, perfspec):
    def available_fonts():
        """Return the names of every font Matplotlib can use on this system."""
        import matplotlib.font_manager

        available_fonts = [f.name for f in matplotlib.font_manager.fontManager.ttflist]
        return available_fonts

    # marimo only renders a cell's LAST expression, so build the markdown
    # conditionally instead of calling mo.md inside an `if` body (where it
    # would never be displayed).
    mo.md(
        f"""
        ### Matplot available fonts

        Fonts: {available_fonts()}

        """
    ) if perfspec['settings']['verbose'] == 'dev' else None
    return (available_fonts,)


@app.cell(hide_code=True)
def actions_distrib(mo, perfspec):
    mo.md(
        f"""
        # {mo.icon('lucide:chart-spline', color="orange")} Plot actions distribution

        Show how **Resources** and critical **methods** are concentrated or distributed

        A distribution map is generated with a plot graphic and saved from the **actions dataset**
        to {perfspec['settings']['actions_distrib_filepath']}
        using format {perfspec['settings']['actions_distrib_format']}
        """
    )
    return


@app.cell(hide_code=True)
def plot_actions_distrib(copy, mo, np, perfspec):
    def actions_distrib(filename, output_path, output_format):
        """Scatter-plot verb x resource frequencies from the actions file.

        Saves the figure to *output_path* in *output_format*, publishes the
        event Counter as perfspec['vars']['actions_distrib'], and returns the
        pyplot module so the caller can embed the figure.
        """
        from collections import Counter
        import textwrap

        import matplotlib
        import matplotlib.pyplot as plt

        font_size = 17

        # Pin a font known to ship with Matplotlib so output is reproducible.
        matplotlib.rcParams['font.family'] = 'DejaVu Serif'

        raw_text = open(filename, 'r', encoding='utf-8').read()
        # File is one event per line; trailing newline yields an empty last item.
        raw_words = raw_text.replace('\n', ',').split(',')[:-1]

        def label_wrap(labels):
            # NOTE(review): currently unused helper, kept for axis-label wrapping.
            work_labels = copy.deepcopy(labels)
            for i, label in enumerate(work_labels):
                work_labels[i] = "\n".join(textwrap.wrap(label, width=15))
            return work_labels

        # Filter out unwanted events (e.g. deletecollection).
        words = [event for event in raw_words if 'collection' not in event]

        counter = Counter(words)

        verbs = []
        resources = []
        for word in words:
            verb, resource = word.split('_')[0], word.split('_')[1]
            verbs.append(verb)
            resources.append(resource)

        counter_verbs = Counter(verbs)
        counter_resources = Counter(resources)

        verbs_set = sorted(set(verbs))
        resources_set = sorted(set(resources), reverse=True)

        # Cartesian product of verb x resource, flattened to parallel lists.
        verbs = [e for e in verbs_set for k in resources_set]
        resources = [e for k in verbs_set for e in resources_set]

        # Marker area scales with event count; text labels carry the raw count.
        area_resource = [3 * counter[verbs[i] + "_" + resources[i]] for i in range(len(verbs))]
        texts = [counter[verbs[i] + "_" + resources[i]] for i in range(len(verbs))]

        plt.rcParams.update({'font.size': font_size})

        fig = plt.figure(figsize=(9, 9), dpi=100)
        ax = fig.add_subplot(111, label="1")
        ax.scatter(verbs, resources, s=area_resource, alpha=0.4, color='gray')

        ax.set_xlabel("Methods", fontsize=font_size)
        ax.set_xticks(verbs_set)
        ax.set_xticklabels(verbs_set, fontsize=font_size - 4, linespacing=1.0)
        ax.xaxis.tick_bottom()
        ax.set_xlim(-0.5, 3.5)

        ax.set_ylabel("Resources", fontsize=font_size)
        ax.set_yticks(resources_set)
        ax.set_yticklabels(resources_set, fontsize=font_size - 8, linespacing=1.0)
        ax.yaxis.tick_left()

        # Annotate only the dense cells; font size interpolated from the count.
        for j in range(len(verbs)):
            if texts[j] > 20:
                plt.annotate(str(texts[j]), (verbs[j], resources[j]), ha='center', va='center',
                             fontsize=np.interp(texts[j], [20, 2234], [8, 20]))

        plt.rcParams['grid.linestyle'] = 'dotted'
        plt.rcParams['grid.alpha'] = 0.3
        plt.grid()

        fig.savefig(output_path, format=output_format, bbox_inches="tight")
        perfspec['vars']['actions_distrib'] = counter
        return plt

    _plt = actions_distrib(
        perfspec['settings']['actions_filepath'],
        perfspec['settings']['actions_distrib_filepath'],
        perfspec['settings']['actions_distrib_format']
    )

    if 'actions_distrib' in perfspec['vars'] and len(perfspec['vars']['actions_distrib'].items()) > 0:
        mo.md(
            f"""

            ## Plot actions distribution

            {mo.as_html(_plt.gcf())}

            """
        )
    return (actions_distrib,)


@app.cell(hide_code=True)
def review_actions_distrib(mo, pd, perfspec):
    def df_actions_table():
        """Build an interactive table of action counts, or None when no data exists."""
        if len(perfspec['vars']['actions_distrib'].items()) > 0:
            df = pd.DataFrame(perfspec['vars']['actions_distrib'].items(), columns=['Action', 'Count'])
            transform_df = mo.ui.table(df, selection="multi")
            return transform_df
        else:
            return None

    transform_df = df_actions_table()
    mo.md(
        f"""

        ## Review actions distribution

        {transform_df}
        """
    )
    return df_actions_table, transform_df


@app.cell(hide_code=True)
def select_actions_distrib(mo, transform_df):
    # Guard against the empty-data path, where transform_df is None and has
    # no `.value` attribute.
    mo.md(
        f"""
        ## Select actions distribution
        {mo.md(f"Selected value: {mo.ui.table(transform_df.value)}") if transform_df is not None else "No actions data available"}
        """
    )
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()