# prepare_perfspec.py — marimo notebook: prepare Kubernetes audit-log data for training.
# (Extracted from patch 62e0f1f9 "chore: add prepare data notebook marimo".)
import marimo

__generated_with = "0.10.16"
app = marimo.App(width="medium")


@app.cell(hide_code=True)
def header():
    """Load shared perfspec helpers and render the notebook header."""
    import marimo as mo

    notebook_name = 'prepare_perfspec.py'

    from lib_perfspec import perfspec_vars
    (_, _defs) = perfspec_vars.run()
    perfspec = _defs['perfspec']

    from lib_perfspec import perfspec_header
    (_, _defs) = perfspec_header.run()
    lib_header = _defs['header']
    lib_intro = _defs['intro']

    mo.md(
        f"""
        {lib_header(notebook_name)}

        ## Prepare data to train **{perfspec['app']['train_mode']}** model
        """
    )
    return (
        lib_header,
        lib_intro,
        mo,
        notebook_name,
        perfspec,
        perfspec_header,
        perfspec_vars,
    )


@app.cell
def imports():
    # `copy` is consumed by the actions-distribution plot cell (label_wrap);
    # without importing (and returning) it here, that cell fails with an
    # undefined-name error at runtime.
    import copy
    import json
    from pathlib import Path

    import numpy as np
    import pandas as pd
    return Path, copy, json, np, pd


@app.cell(hide_code=True)
def intro(Path, lib_intro, mo, notebook_name, perfspec):
    """Parse CLI/notebook arguments and validate that the data directory exists."""
    verbose = perfspec['settings']['verbose']
    perfspec['vars'] = {}

    from lib_perfspec import perfspec_args
    (_, _defs) = perfspec_args.run()

    if not Path(perfspec['defaults']['data_dirpath']).exists():
        # Abort the whole run: nothing below can work without the data dir.
        raise SystemExit(f"data dir path not found: {perfspec['defaults']['data_dirpath']}")

    mo.md(
        f"""
        {lib_intro(notebook_name)}

        """
    )
    return perfspec_args, verbose


@app.cell(hide_code=True)
def load_raw_logs(Path, mo, pd, perfspec):
    def load_raw_logs(filepath):
        """Load JSON-lines audit logs in chunks and store the flattened frame.

        The normalized DataFrame is published as perfspec['vars']['df_raw_data'].
        """
        file_path = Path(filepath)

        if not file_path.exists():
            raise SystemExit(f"File not found: {filepath}")

        # Chunked read keeps peak memory bounded on large logs; adjust the
        # chunk size to the available memory.
        chunk_size = 1000
        chunks = []
        try:
            for chunk in pd.read_json(file_path, lines=True, chunksize=chunk_size, encoding_errors='ignore'):
                chunks.append(chunk)
        except ValueError as e:
            # Best-effort: keep whatever chunks parsed before the error.
            print(f"Error while parsing JSON: {e}")

        # NOTE(review): if every chunk failed to parse, `chunks` is empty and
        # pd.concat raises ValueError — confirm whether that should be handled.
        df = pd.concat(chunks, ignore_index=True)

        # Flatten nested JSON records into columns joined with '_'
        # (e.g. user.username -> user_username).
        df_normalized = pd.json_normalize(
            df.to_dict(orient='records'),
            sep='_'
        )
        if perfspec['settings']['verbose'] is not None or mo.running_in_notebook():
            print(f"Loaded {len(df_normalized)} rows from raw logs ")
        perfspec['vars']['df_raw_data'] = df_normalized

    load_raw_logs(perfspec['settings']['raw_audit_log'])

    mo.md(
        f"""
        ## {mo.icon('lucide:database', color="green")} Load raw logs into a Dataset

        Loading raw data logs from: {perfspec['settings']['raw_audit_log']}
        """
    )
    return (load_raw_logs,)


@app.cell(hide_code=True)
def create_main_audit_logs(Path, mo, perfspec):
    def create_main_audit_logs(df_normalized, outputfile):
        """Drop noise columns, keep the audit fields of interest, write JSON lines."""
        remove_fields = [
            "apiVersion", "level", "sourceIPs", "kind",
            "annotations", "stageTimestamp", "userAgent"
        ]

        keep_fields = [
            "requestReceivedTimestamp", "user_username", "verb",
            "objectRef_resource", "objectRef_subresource", "objectRef_name",
            "requestURI", "auditID", "stage",
            "responseStatus_code", "objectRef_uid",
        ]

        df_cleaned = df_normalized.drop(
            columns=[field for field in remove_fields if field in df_normalized.columns],
            errors='ignore',
        )

        # NOTE(review): this raises KeyError if any keep_field is absent from
        # the raw logs — assumes every field is always present; confirm.
        df_final = df_cleaned[keep_fields].copy()

        output_file = Path(outputfile)
        if output_file.exists():
            output_file.unlink()

        df_final.to_json(output_file, orient='records', lines=True, force_ascii=False)
        if perfspec['settings']['verbose'] is not None or mo.running_in_notebook():
            print(f"Main audit log created in {output_file}")

    create_main_audit_logs(perfspec['vars']['df_raw_data'], perfspec['settings']['main_audit_log'])

    mo.md(
        f"""
        ## {mo.icon('lucide:scroll', color="red")} Cleanup Dataset

        Create a main **audit log** as starting point for operations

        Final log will be in {perfspec['settings']['main_audit_log']}
        """
    )
    return (create_main_audit_logs,)


@app.cell(hide_code=True)
def generate_actions_data(Path, mo, pd, perfspec):
    def generate_actions_data(filepath):
        """Filter the main audit log down to meaningful actions.

        Read-only verbs (get/watch/list) and noisy resources (events, leases,
        replicationcontrollers) are removed; each remaining row gets an
        `event_type` of the form "<verb>_<resource>". The result is published
        as perfspec['vars']['df_actions_dataset'].
        """
        file_path = Path(filepath)
        if not file_path.exists():
            raise SystemExit(f"File path: {filepath} does not exist")

        df_actions = pd.read_json(file_path, orient='records', lines=True)

        # Column order must match what create_main_audit_logs wrote.
        names = ['requestReceivedTimestamp', 'user_username', 'verb',
                 'objectRef_resource', 'objectRef_subresource', 'objectRef_name',
                 'requestURI', 'auditID', 'stage', 'responseStatus_code', "objectRef_uid"]
        df_actions.columns = names

        df_actions = df_actions.drop_duplicates(ignore_index=True)

        # Keep only state-changing verbs on non-noise resources.
        df_actions = df_actions[
            (df_actions['verb'] != "get") &
            (df_actions['verb'] != "watch") &
            (df_actions['verb'] != "list") &
            (df_actions['objectRef_resource'] != "events") &
            (df_actions['objectRef_resource'] != "leases")
        ]
        df_actions = df_actions[df_actions.objectRef_resource != "replicationcontrollers"]

        df_actions["event_type"] = df_actions["verb"] + "_" + df_actions["objectRef_resource"]

        perfspec['vars']['df_actions_dataset'] = df_actions
        if perfspec['settings']['verbose'] is not None or mo.running_in_notebook():
            print(f"Main audit log prepared for actions data with: {df_actions.count()} rows")

    generate_actions_data(perfspec['settings']['main_audit_log'])
    mo.md(
        f"""
        ## {mo.icon('lucide:database', color="green")} Load audit logs for Actions

        Loading **main audit data logs** from: {perfspec['settings']['main_audit_log']}
        """
    )
    return (generate_actions_data,)


@app.cell(hide_code=True)
def save_actions_data(Path, mo, perfspec):
    def save_actions_data(df_audit_logs, filepath):
        """Write one `event_type` per line to *filepath*.

        An existing file is treated as a trained model's data source and is
        never overwritten — the function just reports and returns.
        """
        file_path = Path(filepath)
        if file_path.exists():
            print(f"File path already exist: {filepath} DELETE to process")
            return

        # Open once and stream every row, instead of re-opening the file
        # in append mode for each DataFrame row.
        with open(file_path, "w") as event_file:
            for _, row in df_audit_logs.iterrows():
                event_file.write("%s\n" % row['event_type'])

    save_actions_data(perfspec['vars']['df_actions_dataset'], perfspec['settings']['actions_filepath'])
    mo.md(
        f"""
        ### {mo.icon('lucide:save', color="green")} Save Actions Data

        Save **actions data** logs in: {perfspec['settings']['actions_filepath']}

        > If file exists it will not be deleted or overwritten, basically as a **trained models data source**

        """
    )
    return (save_actions_data,)


@app.cell(hide_code=True)
def plot_available_fonts(mo, perfspec):
    def available_fonts():
        """Return the names of every font Matplotlib can use on this system."""
        import matplotlib.font_manager

        available_fonts = [f.name for f in matplotlib.font_manager.fontManager.ttflist]
        return available_fonts

    # marimo only renders a cell's LAST expression, so build the markdown
    # conditionally instead of calling mo.md inside an `if` body (where it
    # would never be displayed).
    mo.md(
        f"""
        ### Matplot available fonts

        Fonts: {available_fonts()}

        """
    ) if perfspec['settings']['verbose'] == 'dev' else None
    return (available_fonts,)


@app.cell(hide_code=True)
def actions_distrib(mo, perfspec):
    mo.md(
        f"""
        # {mo.icon('lucide:chart-spline', color="orange")} Plot actions distribution

        Show how **Resources** and critical **methods** are concentrated or distributed

        A distribution map is generated with a plot graphic and saved from the **actions dataset**
        to {perfspec['settings']['actions_distrib_filepath']}
        using format {perfspec['settings']['actions_distrib_format']}
        """
    )
    return


@app.cell(hide_code=True)
def plot_actions_distrib(copy, mo, np, perfspec):
    def actions_distrib(filename, output_path, output_format):
        """Scatter-plot verb x resource frequencies from the actions file.

        Saves the figure to *output_path* in *output_format*, publishes the
        event Counter as perfspec['vars']['actions_distrib'], and returns the
        pyplot module so the caller can embed the figure.
        """
        from collections import Counter
        import textwrap

        import matplotlib
        import matplotlib.pyplot as plt

        font_size = 17

        # Pin a font known to ship with Matplotlib so output is reproducible.
        matplotlib.rcParams['font.family'] = 'DejaVu Serif'

        raw_text = open(filename, 'r', encoding='utf-8').read()
        # File is one event per line; trailing newline yields an empty last item.
        raw_words = raw_text.replace('\n', ',').split(',')[:-1]

        def label_wrap(labels):
            # NOTE(review): currently unused helper, kept for axis-label wrapping.
            work_labels = copy.deepcopy(labels)
            for i, label in enumerate(work_labels):
                work_labels[i] = "\n".join(textwrap.wrap(label, width=15))
            return work_labels

        # Filter out unwanted events (e.g. deletecollection).
        words = [event for event in raw_words if 'collection' not in event]

        counter = Counter(words)

        verbs = []
        resources = []
        for word in words:
            verb, resource = word.split('_')[0], word.split('_')[1]
            verbs.append(verb)
            resources.append(resource)

        counter_verbs = Counter(verbs)
        counter_resources = Counter(resources)

        verbs_set = sorted(set(verbs))
        resources_set = sorted(set(resources), reverse=True)

        # Cartesian product of verb x resource, flattened to parallel lists.
        verbs = [e for e in verbs_set for k in resources_set]
        resources = [e for k in verbs_set for e in resources_set]

        # Marker area scales with event count; text labels carry the raw count.
        area_resource = [3 * counter[verbs[i] + "_" + resources[i]] for i in range(len(verbs))]
        texts = [counter[verbs[i] + "_" + resources[i]] for i in range(len(verbs))]

        plt.rcParams.update({'font.size': font_size})

        fig = plt.figure(figsize=(9, 9), dpi=100)
        ax = fig.add_subplot(111, label="1")
        ax.scatter(verbs, resources, s=area_resource, alpha=0.4, color='gray')

        ax.set_xlabel("Methods", fontsize=font_size)
        ax.set_xticks(verbs_set)
        ax.set_xticklabels(verbs_set, fontsize=font_size - 4, linespacing=1.0)
        ax.xaxis.tick_bottom()
        ax.set_xlim(-0.5, 3.5)

        ax.set_ylabel("Resources", fontsize=font_size)
        ax.set_yticks(resources_set)
        ax.set_yticklabels(resources_set, fontsize=font_size - 8, linespacing=1.0)
        ax.yaxis.tick_left()

        # Annotate only the dense cells; font size interpolated from the count.
        for j in range(len(verbs)):
            if texts[j] > 20:
                plt.annotate(str(texts[j]), (verbs[j], resources[j]), ha='center', va='center',
                             fontsize=np.interp(texts[j], [20, 2234], [8, 20]))

        plt.rcParams['grid.linestyle'] = 'dotted'
        plt.rcParams['grid.alpha'] = 0.3
        plt.grid()

        fig.savefig(output_path, format=output_format, bbox_inches="tight")
        perfspec['vars']['actions_distrib'] = counter
        return plt

    _plt = actions_distrib(
        perfspec['settings']['actions_filepath'],
        perfspec['settings']['actions_distrib_filepath'],
        perfspec['settings']['actions_distrib_format']
    )

    if 'actions_distrib' in perfspec['vars'] and len(perfspec['vars']['actions_distrib'].items()) > 0:
        mo.md(
            f"""

            ## Plot actions distribution

            {mo.as_html(_plt.gcf())}

            """
        )
    return (actions_distrib,)


@app.cell(hide_code=True)
def review_actions_distrib(mo, pd, perfspec):
    def df_actions_table():
        """Build an interactive table of action counts, or None when no data exists."""
        if len(perfspec['vars']['actions_distrib'].items()) > 0:
            df = pd.DataFrame(perfspec['vars']['actions_distrib'].items(), columns=['Action', 'Count'])
            transform_df = mo.ui.table(df, selection="multi")
            return transform_df
        else:
            return None

    transform_df = df_actions_table()
    mo.md(
        f"""

        ## Review actions distribution

        {transform_df}
        """
    )
    return df_actions_table, transform_df


@app.cell(hide_code=True)
def select_actions_distrib(mo, transform_df):
    # Guard against the empty-data path, where transform_df is None and has
    # no `.value` attribute.
    mo.md(
        f"""
        ## Select actions distribution
        {mo.md(f"Selected value: {mo.ui.table(transform_df.value)}") if transform_df is not None else "No actions data available"}
        """
    )
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()