459 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			459 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import marimo

# Version of marimo that generated this notebook (kept for reproducibility;
# marimo checks it when re-opening the file).
__generated_with = "0.10.17"

# Notebook application object; every cell below registers itself on it.
app = marimo.App(width="medium")
@app.cell(hide_code=True)
def header():
    """Load the shared perfspec configuration and render the notebook title."""
    import marimo as mo

    notebook_name = 'prepare_perfspec.py'

    # Shared settings come from the companion lib_perfspec notebooks; each
    # .run() executes the library cell and exposes its definitions.
    from lib_perfspec import perfspec_vars
    _, _vars_defs = perfspec_vars.run()
    perfspec = _vars_defs['perfspec']

    from lib_perfspec import perfspec_header
    _, _header_defs = perfspec_header.run()
    lib_header = _header_defs['header']
    lib_intro = _header_defs['intro']

    mo.md(
        f"""
        {lib_header(notebook_name)}

        ## Prepare data to train **{perfspec['app']['train_mode']}**  model
        """
    )
    return (
        lib_header,
        lib_intro,
        mo,
        notebook_name,
        perfspec,
        perfspec_header,
        perfspec_vars,
    )
@app.cell(hide_code=True)
def imports():
    """Expose the shared standard-library and third-party imports."""
    # Standard library.
    import json
    from pathlib import Path

    # Third-party.
    import numpy as np
    import pandas as pd

    return Path, json, np, pd
@app.cell(hide_code=True)
def intro(Path, lib_intro, mo, notebook_name, perfspec):
    """Parse CLI arguments, validate the data directory and render the intro."""
    verbose = perfspec['settings']['verbose']
    # Scratch space shared by the following cells (DataFrames, counters, ...).
    perfspec['vars'] = {}

    from lib_perfspec import perfspec_args
    (_, _defs) = perfspec_args.run()

    # Abort early when the configured data directory is missing.
    # `raise SystemExit(msg)` is the explicit spelling of the `exit(msg)`
    # builtin: prints the message to stderr and terminates with status 1,
    # and it does not depend on the `site` module being loaded.
    if not Path(perfspec['defaults']['data_dirpath']).exists():
        raise SystemExit(f"data dir path not found: {perfspec['defaults']['data_dirpath']}")

    mo.md(
        f"""
        {lib_intro(notebook_name)}

        """
    )
    return perfspec_args, verbose
@app.cell(hide_code=True)
def load_raw_logs(Path, mo, pd, perfspec):
    def load_raw_logs(filepath):
        """Load JSON-lines audit logs in chunks, flatten the nested records
        and stash the result in perfspec['vars']['df_raw_data']."""
        file_path = Path(filepath)

        if not file_path.exists():
            raise SystemExit(f"File not found: {filepath}")

        # Read the file in chunks so very large logs fit in memory.
        chunk_size = 1000

        chunks = []
        try:
            for chunk in pd.read_json(file_path, lines=True, chunksize=chunk_size, encoding_errors='ignore'):
                chunks.append(chunk)
        except ValueError as e:
            # Keep whatever chunks parsed before the error (best-effort load).
            print(f"Error while parsing JSON: {e}")

        # Nothing parsed at all: fail with a clear message instead of letting
        # pd.concat raise "No objects to concatenate".
        if not chunks:
            raise SystemExit(f"No valid JSON records read from: {filepath}")

        df = pd.concat(chunks, ignore_index=True)

        # Flatten nested JSON objects into underscore-separated columns
        # (e.g. objectRef.resource -> objectRef_resource).
        df_normalized = pd.json_normalize(
            df.to_dict(orient='records'),
            sep='_'
        )
        if perfspec['settings']['verbose'] is not None or mo.running_in_notebook():
            print(f"Loaded {len(df_normalized)} rows from raw logs ")
        perfspec['vars']['df_raw_data'] = df_normalized

    load_raw_logs(perfspec['settings']['raw_audit_log'])

    mo.md(
        f"""
        ##  {mo.icon('lucide:database', color="green")} Load raw logs into a Dataset

        Loading raw data logs from: {perfspec['settings']['raw_audit_log']}
        """
    )
    return (load_raw_logs,)
@app.cell(hide_code=True)
def create_main_audit_logs(Path, mo, perfspec):
    def create_main_audit_logs(df_normalized, outputfile):
        """Reduce the flattened raw-log DataFrame to the audit columns of
        interest and write it to `outputfile` as JSON lines."""
        # Columns dropped outright when present.
        remove_fields = [
            "apiVersion", "level", "sourceIPs", "kind",
            "annotations", "stageTimestamp", "userAgent"
        ]

        # Columns kept, in output order.
        keep_fields = [
            "requestReceivedTimestamp", "user_username", "verb",
            "objectRef_resource", "objectRef_subresource", "objectRef_name",
            "requestURI", "auditID", "stage",
            "responseStatus_code", "objectRef_uid",
        ]

        df_cleaned = df_normalized.drop(columns=[field for field in remove_fields if field in df_normalized.columns], errors='ignore')

        # NOTE(review): this raises KeyError if any keep_field is absent —
        # assumes the raw logs always contain all of them; confirm upstream.
        df_final = df_cleaned[keep_fields].copy()

        output_file = Path(outputfile)

        # Recreate the output from scratch on every run.
        if output_file.exists():
            output_file.unlink()

        df_final.to_json(output_file, orient='records', lines=True, force_ascii=False)
        if perfspec['settings']['verbose'] is not None or mo.running_in_notebook():
            print(f"Main audit log created in {output_file}")

    create_main_audit_logs(perfspec['vars']['df_raw_data'], perfspec['settings']['main_audit_log'])

    mo.md(
        f"""
        ##  {mo.icon('lucide:scroll', color="red")} Cleanup Dataset

        Create a  main **audit log** as starting point for operations

        Final log will be in {perfspec['settings']['main_audit_log']}
        """
    )
    return (create_main_audit_logs,)
@app.cell(hide_code=True)
def genereate_actions_data(Path, mo, pd, perfspec):
    def generate_actions_data(filepath):
        """Load the main audit log, drop noisy verbs/resources and derive an
        `event_type` column; result lands in perfspec['vars']['df_actions_dataset']."""
        file_path = Path(filepath)
        if not file_path.exists():
            raise SystemExit(f"File path: {filepath} not exists")

        df_actions = pd.read_json(file_path, orient='records', lines=True)

        names = ['requestReceivedTimestamp', 'user_username', 'verb',
                 'objectRef_resource', 'objectRef_subresource', 'objectRef_name',
                 'requestURI', 'auditID', 'stage', 'responseStatus_code', "objectRef_uid"]

        # Assign the column names to the DataFrame
        df_actions.columns = names

        df_actions = df_actions.drop_duplicates(ignore_index=True)

        # Drop read-only verbs and chatty bookkeeping resources; they carry
        # no signal for the actions model.
        df_actions = df_actions[
            ~df_actions['verb'].isin(["get", "watch", "list"])
            & ~df_actions['objectRef_resource'].isin(["events", "leases", "replicationcontrollers"])
        ]

        # The training token for each row: "<verb>_<resource>".
        df_actions["event_type"] = df_actions["verb"] + "_" + df_actions["objectRef_resource"]

        perfspec['vars']['df_actions_dataset'] = df_actions
        if perfspec['settings']['verbose'] is not None or mo.running_in_notebook():
            # len(df) is the row count; DataFrame.count() would print a whole
            # per-column Series here.
            print(f"Main audit log prepared for actions data with: {len(df_actions)} rows")

    generate_actions_data(perfspec['settings']['main_audit_log'])
    mo.md(
        f"""
        ##  {mo.icon('lucide:database', color="green")} Load audit logs for Actions

        Loading **main audit data logs** from: {perfspec['settings']['main_audit_log']}
        """
    )
    return (generate_actions_data,)
@app.cell(hide_code=True)
def save_actions_data(Path, mo, perfspec):
    def save_actions_data(df_audit_logs, filepath):
        """Write one `event_type` per line to `filepath`.

        An existing file is never touched: it is treated as the frozen data
        source of already-trained models.
        """
        file_path = Path(filepath)
        if file_path.exists():
            print(f"File path already exist: {filepath} DELETE to process")
            return

        # Open once and stream every event; the original reopened the file
        # in append mode for every single row.
        with open(file_path, "w") as event_file:
            for _, row in df_audit_logs.iterrows():
                event_file.write("%s\n" % row['event_type'])

    save_actions_data(perfspec['vars']['df_actions_dataset'], perfspec['settings']['actions_filepath'])
    mo.md(
        f"""
        ###  {mo.icon('lucide:save', color="green")} Save Actions Data 

        Save **actions data** logs in: {perfspec['settings']['actions_filepath']}

        > If file exists it will not be deleted or overwritten, basically as a **trained models data source** 

        """
    )
    return (save_actions_data,)
@app.cell(hide_code=True)
def plot_available_fonts(mo, perfspec):
    def available_fonts():
        """Return the names of every font Matplotlib can see on this system."""
        from matplotlib import font_manager

        return [font.name for font in font_manager.fontManager.ttflist]

    # Only surface the font list in developer-verbose mode.
    if perfspec['settings']['verbose'] == 'dev':
        mo.md(
                f"""
                ### Matplot available fonts

                Fonts: {available_fonts()}

                """
        )
    return (available_fonts,)
@app.cell
def main(mo):
    # Anchor element so other sections can deep-link back here via "#main".
    mo.md("<a id='main' />")
    return
@app.cell(hide_code=True)
def actions_distrib(mo, perfspec):
    # Section header for the distribution plot. Fixes typos in the rendered
    # text and removes a stray ")" that had leaked into the markdown string.
    mo.md(
        f"""
        #  {mo.icon('lucide:chart-spline', color="orange")}  Plot actions distribution

        Show how **Resources** and critical **methods** are concentrated or distributed

        A distribution map is generated with plot graphic and saved from **actions-dataset**<br>
            to  {perfspec['settings']['actions_distrib_filepath']} <br>
            usando formato {perfspec['settings']['actions_distrib_format']} <br>
        """
    )
    return
@app.cell(hide_code=True)
def plot_actions_distrib(mo, np, perfspec):
    # NOTE(review): the original cell signature also required `copy`, but no
    # cell in this notebook defines it, so marimo could never resolve the
    # reference; its only use was a never-called label-wrap helper, now removed.
    def actions_distrib(filename, output_path, output_format):
        """Plot a verb x resource bubble chart of action frequencies.

        Reads one "<verb>_<resource>" token per line from `filename`, saves
        the figure to `output_path` in `output_format`, stores the Counter in
        perfspec['vars']['actions_distrib'] and returns the pyplot module.
        """
        from collections import Counter
        import matplotlib.pyplot as plt
        import matplotlib

        font_size = 17

        # Set the font to a specific one that is known to be available.
        matplotlib.rcParams['font.family'] = 'DejaVu Serif'

        # Close the file handle deterministically (the original leaked it).
        with open(filename, 'r', encoding='utf-8') as events_file:
            raw_text = events_file.read()
        # Trailing newline produces an empty final token -> drop it with [:-1].
        raw_words = raw_text.replace('\n', ',').split(',')[:-1]

        # Filter out unwanted events (e.g. deletecollection).
        words = []
        for event in raw_words:
            if not ('collection' in event):
                words.append(event)

        verbs = []
        resources = []

        counter = Counter(words)

        for word in words:
            verb, resource = word.split('_')[0], word.split('_')[1]
            verbs.append(verb)
            resources.append(resource)

        verbs_set = list(set(verbs))
        resources_set = list(set(resources))

        verbs_set.sort()
        resources_set.sort(reverse=True)

        # Cartesian product: one scatter point per (verb, resource) pair.
        verbs = [e for e in verbs_set for k in resources_set]
        resources = [e for k in verbs_set for e in resources_set]

        # Bubble area scales with the pair's frequency; texts holds raw counts.
        area_resource = [3 * counter[verbs[i] + "_" + resources[i]] for i in range(len(verbs))]
        texts = [counter[verbs[i] + "_" + resources[i]] for i in range(len(verbs))]

        plt.rcParams.update({'font.size': font_size})

        fig = plt.figure(figsize=(9, 9), dpi=100)

        ax = fig.add_subplot(111, label="1")

        ax.scatter(verbs, resources, s=area_resource, alpha=0.4, color='gray')

        ax.set_xlabel("Methods", fontsize=font_size)
        ax.set_xticks(verbs_set)
        ax.set_xticklabels(verbs_set, fontsize=font_size - 4, linespacing=1.0)
        ax.xaxis.tick_bottom()
        ax.set_xlim(-0.5, 3.5)

        ax.set_ylabel("Resources", fontsize=font_size)
        ax.set_yticks(resources_set)
        ax.set_yticklabels(resources_set, fontsize=font_size - 8, linespacing=1.0)
        ax.yaxis.tick_left()

        # Annotate only the busy pairs; font size interpolated from the count.
        for j in range(len(verbs)):
            if texts[j] > 20:
                plt.annotate(str(texts[j]), (verbs[j], resources[j]), ha='center', va='center', fontsize=np.interp(texts[j], [20, 2234], [8, 20]))

        plt.rcParams['grid.linestyle'] = 'dotted'
        plt.rcParams['grid.alpha'] = 0.3

        plt.grid()
        fig.savefig(output_path, format=output_format, bbox_inches="tight")
        perfspec['vars']['actions_distrib'] = counter
        return plt

    _plt = actions_distrib(
            perfspec['settings']['actions_filepath'],
            perfspec['settings']['actions_distrib_filepath'],
            perfspec['settings']['actions_distrib_format']
        )

    if 'actions_distrib' in perfspec['vars'] and len(perfspec['vars']['actions_distrib'].items()) > 0:
        mo.md(
            f"""

            ## Plot actions distribution

            {mo.as_html(_plt.show())}

            ## Plot actions distribution

            """
        )
    return (actions_distrib,)
@app.cell(hide_code=True)
def review_actions_distrib(mo, pd, perfspec):
    def df_actions_table():
        """Build an interactive table of action counts, or None when the
        distribution has not been computed yet."""
        if len(perfspec['vars']['actions_distrib'].items()) > 0:
            df = pd.DataFrame(perfspec['vars']['actions_distrib'].items(), columns=['Action', 'Count'])
            # The original also created an unused "Max age" slider here; it
            # was never wired to anything and has been removed.
            transform_df = mo.ui.table(df, selection="multi")
            return transform_df
        else:
            return None

    transform_df = df_actions_table()
    mo.md(
        f"""

        ## Review actions distribution

        {transform_df}
        """
    )
    return df_actions_table, transform_df
@app.cell(hide_code=True)
def select_actions_distrib(mo, transform_df):
    # Show the rows the user selected in the review table above.
    # Fixes the user-facing typo "Selecte value".
    mo.md(
        f"""
        ## Select actions distribution
        {mo.md(f"Selected value: {mo.ui.table(transform_df.value)}")}
        """
    )
    return
@app.cell
def _():
    # Empty placeholder cell left by marimo; intentionally does nothing.
    return
if __name__ == "__main__":
    # Run the notebook as a standalone marimo app when executed directly.
    app.run()