Source code for descriptive.pyeda

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.gofplots import qqplot
import numpy as np
import warnings


# --- Create List of Color Palettes ---
red_grad = ['#FF0000', '#BF0000', '#800000', '#400000', '#000000']
pink_grad = ['#8A0030', '#BA1141', '#FF5C8A', '#FF99B9', '#FFDEEB']
purple_grad = ['#4C0028', '#7F0043', '#8E004C', '#A80059', '#C10067']
color_mix = ['#F38BB2', '#FFB9CF', '#FFD7D7', '#F17881', '#E7525B', "#FFCDCD", "#FFBCBC", "#FF9090"]
black_grad = ['#100C07', 'dimgray', '#6D6A6A', '#9B9A9C', '#CAC9CD']
colors = ["lightslategrey", "crimson", "#06344d", "#b4dbe9", "royalblue"]
pastel = sns.color_palette("pastel")


#             ----------------  Libraries Settings  ---------------

pd.set_option("display.max_columns", None)
sns.set_style(style="whitegrid")
plt.rcParams["figure.dpi"] = 80
plt.rc("figure", autolayout=True, figsize=(10, 6))
warnings.filterwarnings("ignore")


#             ----------------  EDA Methods  ---------------


[docs]def show_values(axs, orient="v", space=.01): def _single(axes2): if orient == "v": for p in axes2.patches: _x = p.get_x() + p.get_width() / 2 _y = p.get_y() + p.get_height() + (p.get_height()*0.01) value = '{:.2f}'.format(p.get_height()) axes2.text(_x, _y, value, ha="center", color=colors[0]) elif orient == "h": for p in axes2.patches: _x = p.get_x() + p.get_width() + float(space) _y = p.get_y() + p.get_height() - (p.get_height()*0.5) value = '{:.2f}'.format(p.get_width()) axes2.text(_x, _y, value, ha="left", color=colors[0]) if isinstance(axs, np.ndarray): for idx, ax in np.ndenumerate(axs): _single(ax) else: _single(axs)
[docs]def import_dataset(file_name: str): """ Read cvs data file. :param file_name: string contain the csv file name :return: pandas dataframe """ try: df = pd.read_csv(file_name, parse_dates=True, header=0) return df except FileNotFoundError: print("No such File, upload your dataset CSV file to the project")
[docs]def read_dataset(data): """ Reading data from a variable. :param data: variable :return: data """ return data.head()
[docs]def display_dataset_info(data) -> None: """ Print dataset info. :param data: variable :return: total rows and columns """ print("\033[1m"+".: Dataset Info :."+"\033[0m") print("*" * 30) print("Total Rows: " + "\033[1m", data.shape[0]) print("\033[0m" + "Total Columns: " + "\033[1m", data.shape[1]) print("\033[0m" + "*" * 30) print("\n")
[docs]def display_dataset_detail(data): """ Print dataset details. :param data: variable :return: dataset details """ print("\033[1m"+".: Dataset Details :. " + "\033[0m") print("*" * 30) print(data.info(memory_usage=False))
[docs]def display_summary_data(data): """ The function prints out a summary table of columns. - number of unique. - Null values. - Null Percentage. - DataType. :param data: variable :return: Summarize columns """ summary_tabel = pd.DataFrame({"Unique": data.nunique(), "Null": data.isna().sum(), "NullPercent": data.isna().sum() / len(data), "Types": data.dtypes.values}) print(summary_tabel)
[docs]def display_column_types(data): """ Separate numerical columns and categorical columns. :param data: variable :return: list of numerical and categorical feature names """ numeric_variable_lst = data.select_dtypes(include=["int64", "float64"]).columns.tolist() categorical_variable_lst = data.select_dtypes(include=["object", "category"]).columns.tolist() print(f"Numerical Columns: {numeric_variable_lst}\nCategorical Columns: {categorical_variable_lst}")
[docs]def select_numeric_variables(data) -> list: """ Selecting numerical variables. :param data: variable :return: all numeric features in a dataset """ numeric_variable_lst = data.select_dtypes(exclude="object") return list(numeric_variable_lst)
[docs]def select_categorical_variables(data) -> list: """ Selecting categorical variables. :param data: variable :return: all categorical features in a dataset """ categorical_variable_lst = data.select_dtypes(include=["object", "category"]) return list(categorical_variable_lst)
[docs]def visualize_distribution_of_numeric_col(data_frame, column_name: str, bins: int) -> None: """ Visualize the distribution of a numeric column. :param data_frame: variable :param column_name: str :param bins: int :return: Histogram, boxplot, q-q plot, skewness and kurtosis values """ # Testing column Type if not data_frame[column_name].dtype in ["int64", "float64"]: raise TypeError(f"{column_name} Is not a Numeric Column. Only Numeric Column is Allowed") print(f"\033[1m" + f".: {column_name.title()} Skewness & Kurtosis :. " + "\033[0m") print("*" * 40) print("Skewness: "+"\033[1m{:.3f}".format(data_frame[column_name].skew(axis=0, skipna=True))) print("\033[0m"+"Kurtosis:" + "\033[1m{:.3f}".format(data_frame[column_name].kurt(axis=0, skipna=True))) print("\n") fig = plt.figure(figsize=(10, 6)) fig.suptitle(f"{column_name.title()} Distribution", fontweight="heavy", x=0.069, y=0.98, ha="left", fontsize="14", fontfamily="sans-serif", color=black_grad[0]) # Histogram fig.add_subplot(2, 2, 2) ax_1 = sns.histplot(data=data_frame, x=column_name, kde=True, color=colors[1], bins=bins) plt.xlabel(column_name.title(), fontweight="regular", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.ylabel(None) plt.grid(axis="y", alpha=0) plt.grid(axis="x", alpha=0) ax_1.set(frame_on=False) # Q-Q Plot ax_2 = fig.add_subplot(2, 2, 4) plt.title("Q-Q Plot", fontweight="bold", fontsize=14, fontfamily="sans-serif", color=black_grad[1]) qqplot(data_frame[column_name], fit=True, line="45", ax=ax_2, markerfacecolor=colors[1], alpha=0.6) # Box Plot fig.add_subplot(1, 2, 1) ax_3 = sns.boxplot(data_frame, y=column_name, color=colors[1], boxprops=dict(alpha=0.8), linewidth=1.5) plt.ylabel(column_name, fontweight="regular", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.tight_layout(rect=[0, 0.04, 1, 1.025]) plt.grid(axis="y", alpha=0) plt.grid(axis="x", alpha=0) ax_3.set(frame_on=False) plt.show()
[docs]def visualize_distribution_of_categorical_col(data_frame, column_name: str): """ Visualize the distribution of a categorical column. :param data_frame: variable :param column_name: str :return: histogram, and pie charts """ # Testing column Type if not data_frame[column_name].dtype in ["object", "category"]: raise TypeError(f"{column_name} Is a Numeric Column. Only Categorical Column is Allowed") print("*" * 25) print(f"\033[1m" + f".: {column_name.title()} Total :. " + "\033[0m") print("*" * 25) print(data_frame[column_name].value_counts(dropna=False)) order = data_frame[column_name].value_counts().index fig = plt.figure(figsize=(10, 6)) fig.suptitle(f"{column_name.title()} Distribution", fontweight="heavy", x=0.069, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0]) # Pie Chart plt.subplot(1, 2, 1) labels = data_frame[column_name].unique() plt.pie(data_frame[column_name].value_counts(), colors=pastel, pctdistance=0.7, autopct="%.2f%%", labels=labels, wedgeprops=dict(alpha=0.8, edgecolor=colors[0]), textprops={"fontsize": 12}, startangle=90) centre = plt.Circle((0, 0), 0.45, fc="white", edgecolor=colors[0]) plt.gcf().gca().add_artist(centre) # Histogram plt.subplot(1, 2, 2) ax = sns.countplot(x=column_name, data=data_frame, palette=pastel, order=order, edgecolor=black_grad[2], alpha=0.85) show_values(ax, "v") plt.xlabel(None) plt.ylabel(None) plt.xticks(rotation=45) ax.set_yticklabels([]) plt.tight_layout(rect=[0, 0.04, 1, 1.025]) plt.grid(axis="y", alpha=0.1) plt.grid(axis="x", alpha=0) ax.set(frame_on=False) plt.show()
[docs]def visualize_pair_plot(data, numerical_columns: list, by_categorical_col: str): """ Visualize the relationship between multiple numeric columns by categorical column. :param data: variable :param numerical_columns: list :param by_categorical_col: str :return: pair plot """ g = sns.pairplot(data, vars=numerical_columns, hue=by_categorical_col, palette="husl", diag_kind="kde", kind="scatter", grid_kws=dict(diag_sharey=True), plot_kws=dict(s=20, alpha=0.7), diag_kws=dict(fill=False, linewidth=3.0)) g.legend.remove() handles = g._legend_data.values() labels = g._legend_data.keys() g.fig.legend(handles=handles, labels=labels, loc="upper center", ncol=4) g.fig.subplots_adjust(top=0.99, bottom=0.20, ) plt.show()
[docs]def visualize_line_plot(data, first_numeric: str, second_numeric: str, categorical_col: str, title: str = "Add Chart Title"): """ Visualize the relationship between two numeric columns and the categories of a categorical column. Add title to chart as a string. :param data: variable :param first_numeric: str :param second_numeric: str :param categorical_col: str :param title: str :return: lm plot """ g = sns.lmplot(data=data, x=first_numeric, y=second_numeric, hue=categorical_col, palette=pastel, legend=False, lowess=True) plt.xlabel(first_numeric, fontweight="bold", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.ylabel(second_numeric, fontweight="bold", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.ticklabel_format(style="plain", axis="both") plt.grid(axis="both", alpha=0.4, lw=0.2) g.ax.set(frame_on=False) g.ax.set_title(title.title(), fontweight="heavy", x=0.069, y=0.98, ha="left", fontsize="14", fontfamily="sans-serif", color=black_grad[0]) g.ax.legend(title=categorical_col.title(), bbox_to_anchor=(1, 1.08), loc="upper right", frameon=True) g.ax.set_xlabel(first_numeric.title(), fontweight="regular", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) g.ax.set_ylabel(second_numeric.title(), fontweight="regular", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.tight_layout() plt.show()
[docs]def visualize_basic_kde(data, first_numeric: str, title: str = "Add Chart Title", subtitle: str = "Explain ur data viz by subtitle" ): """ Visualize the kernel density estimate of a numeric column. :param data: variable :param first_numeric: str :param title: str :param subtitle: str :return: KDE Plot """ plt.figure(figsize=(10, 6)) plt.suptitle(title.title(), fontweight="heavy", x=0.075, y=0.98, ha="left", fontsize="14", fontfamily="sans-serif", color=black_grad[0]) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="12", fontfamily="sans-serif", color=colors[0], loc="left", pad=10) ax = sns.kdeplot(x=data[first_numeric], fill=True, common_norm=False, palette=pastel, alpha=.5, linewidth=0) plt.xlabel(first_numeric.title(), fontweight="regular", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.tight_layout(rect=[0, 0.04, 1, 1.025]) plt.grid(axis="y", alpha=0.1) plt.grid(axis="x", alpha=0) ax.set(frame_on=False) plt.show()
[docs]def visualize_kde(data, first_numeric: str, categorical_col: str, title: str = "Add Chart Title", subtitle: str = "Explain ur data viz by subtitle" ): """ Visualize the kernel density estimate of a numeric column per categorical column. :param data: variable :param first_numeric: str :param categorical_col: str :param title: str :param subtitle: str :return: KDE Plot """ fig = plt.figure(figsize=(10, 6)) fig.suptitle(title.title(), fontweight="heavy", x=0.069, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0]) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="12", fontfamily="sans-serif", color=colors[0], loc="left", pad=12) ax = sns.kdeplot(x=data[first_numeric], hue=data[categorical_col], fill=True, common_norm=False, palette=pastel, alpha=.5, linewidth=0) plt.xlabel(first_numeric.title(), fontweight="regular", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.tight_layout() plt.grid(axis="y", alpha=0) plt.grid(axis="x", alpha=0) ax.set(frame_on=False) plt.show()
[docs]def visualize_advanced_kde(data, first_numeric: str, second_numeric: str, categorical_col: str, title: str = "Add Chart Title", subtitle: str = "Explain ur data viz by subtitle"): """ Visualize the kernel density estimate of a numeric columns by categorical column. :param data: variable :param first_numeric: str :param second_numeric: str :param categorical_col: str :param title: str :param subtitle: str :return: KDE Plot """ fig = plt.figure(figsize=(10, 6)) fig.suptitle(title.title(), fontweight="heavy", x=0.069, y=0.98, ha="left", fontsize="14", fontfamily="sans-serif", color=black_grad[0]) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="12", fontfamily="sans-serif", color=colors[0], loc="left", pad=12) ax = sns.kdeplot(x=data[first_numeric], y=data[second_numeric], hue=data[categorical_col], shade=True, color=colors[1]) plt.xlabel(first_numeric.title(), fontweight="regular", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.tight_layout() plt.grid(axis="y", alpha=0) plt.grid(axis="x", alpha=0) ax.set(frame_on=False) plt.show()
[docs]def display_describe_data(data) -> None: """ Calculate basic statistics for the whole data. :param data: variable :return: prints out describe() """ print(data.describe(include="all").round(2))
[docs]def visualize_boxplot(data, numeric_column: str, categorical_column: str, title: str = "Add Chart Title", subtitle: str = "Explain ur data viz by subtitle"): """ Visualize the distribution of a numeric column by the categories of a categorical column. Add new title and subtitle content as a string. :param data: variable :param numeric_column: str :param categorical_column: str :param title: str :param subtitle: str :return: Box Plot """ if not data[numeric_column].dtype in ["int64", "float64"]: raise TypeError(f"{numeric_column} Is not a Numeric Column. Only Numeric Column is Allowed") fig = plt.figure(figsize=(10, 6)) fig.suptitle(title.title(), fontweight="heavy", x=0.069, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0]) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="12", fontfamily="sans-serif", color=colors[0], loc="left", pad=12) medians = data.groupby([categorical_column])[numeric_column].median().round(2).sort_values() vertical_offset = data[numeric_column].median() * 0.05 box_plot = sns.boxplot(data=data, x=categorical_column, y=numeric_column, medianprops={"color": "coral"}, order=medians.index, palette=pastel) for xtick in box_plot.get_xticks(): box_plot.text(xtick, medians[xtick] + vertical_offset, medians[xtick], horizontalalignment="center", size=12, color=colors[2], weight="bold") plt.tight_layout() plt.xlabel(None) plt.ylabel(numeric_column.title(), fontweight="bold", fontsize=10, fontfamily="sans-serif", color=colors[0]) plt.grid(axis="x", alpha=0) plt.grid(axis="y", alpha=0.1) box_plot.set(frame_on=False) plt.show()
[docs]def visualize_basic_scatter_plot(data, first_column: str, second_column: str, title: str = "Add Chart Title", subtitle: str = "Explain ur data viz by subtitle"): """ Visualize the relationship between two numeric columns. Add new title and subtitle content as a string. :param data: variable :param first_column: str :param second_column: str :param title: str :param subtitle: str :return: Basic scatter plot """ plt.figure(figsize=(10, 6)) plt.suptitle(title.title(), fontweight="heavy", x=0.069, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="12", fontfamily="sans-serif", color=colors[0], loc="left", pad=8) ax = sns.scatterplot(data=data, x=first_column, y=second_column, legend="full") plt.xlabel(first_column, fontweight="bold", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.ylabel(second_column, fontweight="bold", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.ticklabel_format(style="plain", axis="both") plt.grid(axis="both", alpha=0.4, lw=0.2) ax.set(frame_on=False) plt.tight_layout(rect=[0, 0.04, 1, 1.025]) plt.show()
[docs]def visualize_advanced_scatter_plot(data, first_numeric: str, second_numeric: str, categorical_col: str, title: str = "Add Chart Title", subtitle: str = "Explain ur data viz by subtitle"): """ Vis the relationship between two numeric variable's by a third categorical variable, to dictate the color of data point's. Add new title and subtitle content as a string. :param data: Data frame variable :param first_numeric: str :param second_numeric: str :param categorical_col: str :param title: str :param subtitle: str :return: Advanced scatter plot """ plt.figure(figsize=(10, 6)) plt.suptitle(title.title(), fontweight="heavy", x=0.069, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize=12, fontfamily="sans-serif", color=colors[0], loc="left", pad=10) ax = sns.scatterplot(data=data, x=first_numeric, y=second_numeric, hue=categorical_col, palette="tab10", legend="full") plt.xlabel(first_numeric, fontweight="bold", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.ylabel(second_numeric, fontweight="bold", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.ticklabel_format(style="plain", axis="both") plt.grid(axis="both", alpha=0.4, lw=0.5) ax.legend(title=categorical_col.title(), bbox_to_anchor=(0.98, 0.90), loc="upper left", frameon=True) ax.set(frame_on=False) plt.grid(axis="x", alpha=0) plt.tight_layout(rect=[0, 0.04, 1, 1.025]) plt.show()
[docs]def visualize_stack_bar(data, first_categorical: str, second_categorical: str, title: str = "Add Chart Title", subtitle: str = "explain ur data viz by subtitle"): """ Visualize percentage relationship using two categorical variables. Add new title and subtitle content as a string. :param data: Data fame variable :param first_categorical: str :param second_categorical: str :param title: str :param subtitle: str :return: stacked bar plot """ df = (data .groupby(first_categorical)[second_categorical] .value_counts(normalize=True) .mul(100) .round(2) .unstack()) fig, ax = plt.subplots(figsize=(12, 6)) plt.suptitle(title.title(), fontweight="heavy", x=0.059, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize=12, fontfamily="sans-serif", color=colors[0], loc="left", pad=15) # Plot df.plot.bar(stacked=True, color=pastel, ax=ax, width=0.4) # Adding bar labels for c in ax.containers: labels = [str(round(v.get_height(), 2)) + "%" if v.get_height() > 0 else '' for v in c] ax.bar_label(c, label_type='center', labels=labels, color=colors[0], size=12) # add a container object "c" as first argument # Removing spines for s in ["top", "right"]: ax.spines[s].set_visible(False) # Adding tick and axes labels ax.legend(title=second_categorical.title(), bbox_to_anchor=(0.98, 0.90), loc="upper left") ax.grid(axis="both", alpha=0.4, lw=0.5) ax.set(frame_on=False, xlabel=None) ax.tick_params(labelsize=12, labelrotation=0) ax.set_ylabel("Percentage", size=14, color=colors[0]) plt.tight_layout(rect=[0, 0.04, 1, 1.025]) plt.show()
[docs]def vis_advanced_stack_bar(data, first_categorical: str, second_categorical: str, third_categorical: str, title: str = "Add Chart Title", subtitle: str = "explain ur data viz by subtitle"): """ Visualize percentage relationship using three categorical variables. Add new title and subtitle content as a string. :param data: variable :param first_categorical: str :param second_categorical: str :param third_categorical: str :param title: str :param subtitle: str :return: stacked bar plot """ df = data.groupby([first_categorical, second_categorical])[third_categorical].value_counts(normalize=True)\ .mul(100).round(2).unstack() _, ax = plt.subplots(figsize=(11, 6)) plt.suptitle(title.title(), fontweight="heavy", x=0.26, y=0.98, fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="12", fontfamily="sans-serif", color=black_grad[1], loc="left", pad=15) df.plot(kind="bar", stacked=True, width=0.4, ax=ax, color=pastel) # Adding bar labels for c in ax.containers: labels = [str(round(v.get_height(), 2)) + "%" if v.get_height() > 0 else '' for v in c] ax.bar_label(c, label_type='center', labels=labels, color=colors[0], size=10) # add a container object "c" as first argument # Removing spines for s in ["top", "right"]: ax.spines[s].set_visible(False) ax.legend(title=third_categorical.title(), bbox_to_anchor=(0.98, 0.90), loc="upper left", frameon=True) plt.grid(axis="both", alpha=0.4, lw=0.5) ax.set(frame_on=False, xlabel=None) plt.xticks(rotation=45) plt.tight_layout(rect=[0, 0.04, 1, 1.025]) plt.show()
[docs]def visualize_pie_chart(data, categorical_col: str, numerical_col: str, title: str = "Add Chart Title", subtitle: str = "explain ur data viz by subtitle"): """ Visualize pie chart. Add new title and subtitle content as a string. :param data: variable :param categorical_col: str :param numerical_col: str :param title: str :param subtitle: str :return: pie chart """ _, ax = plt.subplots(figsize=(10, 6)) plt.suptitle(title.title(), fontweight="heavy", x=0.24, y=0.99, fontsize="16", ha="left", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="12", fontfamily="sans-serif", color="dimgray", loc="left", pad=10) labels = data[categorical_col].unique() df = data.groupby(categorical_col)[numerical_col].sum() plt.pie(df, colors=pastel, pctdistance=0.7, autopct="%.2f%%", labels=labels, wedgeprops=dict(alpha=0.8, edgecolor="floralwhite"), textprops={"fontsize": 12}) centre = plt.Circle((0, 0), 0.45, fc="white", edgecolor="aliceblue") plt.gcf().gca().add_artist(centre) ax.legend(title=categorical_col.title(), bbox_to_anchor=(1.2, 0.90), loc="upper left", frameon=True, shadow=True) plt.tight_layout(rect=[0, 0.04, 1, 1.025]) plt.show()
[docs]def visualize_point_plot(data, numerical_col: str, categorical_col: str, title: str = "Add Chart Title", subtitle: str = "Explain ur data viz by subtitle "): """ Visualize the mean of a numerical column by the categories of a categorical column. Add new title and subtitle content as a string. :param data: variable :param numerical_col: str :param categorical_col: str :param title: str :param subtitle: str :return: point plot """ plt.figure(figsize=(10, 6)) plt.suptitle(title.title(), fontweight="heavy", x=0.064, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle, fontweight="heavy", fontsize=12, fontfamily="sans-serif", color=colors[0], loc="left", pad=15) ax = sns.pointplot(data, x=categorical_col, y=numerical_col, palette=pastel) plt.xlabel(categorical_col.title(), fontweight="bold", fontsize=11, fontfamily="sans-serif", color=colors[0]) plt.ylabel(f"\nAverage {numerical_col.title()}", fontweight="bold", fontsize=11, fontfamily="sans-serif", color=colors[0]) plt.grid(axis="both", alpha=0.3, lw=0.1, color="grey", which="major", ) plt.tight_layout() ax.set(frame_on=False) plt.show()
[docs]def visualize_multi_numeric_columns_avg(data, numerical_col: list, categorical_col: str, title: str = "Add chart title ", subtitle: str = "Explain ur data viz by subtitle "): """ Visualize the mean of a multiple numeric columns by the categories of a categorical column. Add new Title and subtitle content to describe your chart. :param data: variable :param numerical_col: list :param categorical_col: str :param title: str :param subtitle: str :return: point plot chart """ axs = data.groupby([categorical_col])[numerical_col].mean().round(2).plot(kind="bar", color=pastel) plt.suptitle(title.title(), fontweight="heavy", x=0.039, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize=12, fontfamily="sans-serif", color=colors[0], loc="left", pad=10) axs.grid(False) plt.tight_layout() axs.set(frame_on=False) show_values(axs, "v") plt.xlabel(None) plt.xticks(rotation=45) axs.legend(title=f"{categorical_col.title()}", bbox_to_anchor=(1.2, 1), shadow=True, title_fontsize="x-large") plt.show()
[docs]def visualize_basic_bar_plot(data, numerical_col: str, categorical_col: str, title: str = "Add chart title ", subtitle: str = "Explain ur data viz by subtitle "): """ Visualize the mean of a numeric column by the categories of a categorical column. Add new Title and subtitle content to describe your chart. :param data: variable :param numerical_col: str :param categorical_col: str :param title: str :param subtitle: str :return: Bar chart """ plt.figure(figsize=(10, 6)) plt.suptitle(title.title(), fontweight="heavy", x=0.075, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle, fontweight="heavy", fontsize=12, fontfamily="sans-serif", color=colors[0], loc="left", pad=15) order = data.groupby([categorical_col])[numerical_col].mean().round().sort_values(ascending=False).index ax = sns.barplot(data=data, y=categorical_col, x=numerical_col, ci=None, palette=pastel, order=order) show_values(ax, "h", space=0) plt.xlabel(f"\nAverage {numerical_col.title()}", fontweight="bold", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.ylabel(f"{categorical_col.title()}", fontweight="bold", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.grid(axis="x", alpha=0) plt.tight_layout() ax.set(frame_on=False) plt.show()
[docs]def visualize_advanced_bar_plot(data, categorical_col: str, numerical_col: str, second_categorical_col: str, title: str = "add chart title", subtitle: str = "Explain ur data viz by subtitle "): """ Summarize two categorical columns by numerical column . Add new title and subtitle content as a string. :param data: variable :param categorical_col: str :param numerical_col: str :param second_categorical_col: str :param title: str :param subtitle: str :return: bar chart """ _, ax = plt.subplots(figsize=(10, 6)) plt.suptitle(title.title(), fontweight="heavy", x=0.060, y=0.990, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="12", fontfamily="sans-serif", color=colors[0], loc="left", pad=10) df = data.groupby([categorical_col, second_categorical_col])[numerical_col].count().reset_index()\ .sort_values(numerical_col, ascending=False) ax = sns.barplot(data=df, x=numerical_col, y=categorical_col, hue=second_categorical_col, palette=pastel, errwidth=0) for rect in ax.containers: ax.bar_label(rect, color=colors[0]) ax.set(frame_on=False) plt.xlabel(f"{numerical_col}", labelpad=15, fontweight="bold", fontsize=16, fontfamily="sans-serif", color=black_grad[1]) plt.ylabel(f"{categorical_col}", labelpad=15, fontweight="bold", fontsize=16, fontfamily="sans-serif", color=black_grad[1]) ax.legend(title=f"{second_categorical_col.title()}", bbox_to_anchor=(1.2, 1), shadow=True, title_fontsize="x-large") plt.tight_layout(rect=[0, 0.04, 1, 1.025]) plt.grid(axis="x", alpha=0.2) plt.show()
[docs]def visualize_countplot(data, first_categorical_col: str, second_categorical_col: str, title: str = "add chart title", subtitle: str = "explain ur data viz by subtitle "): """ Visualize the count of a categorical column by the categories of another categorical column. Add new title and subtitle content as a string. :param data: variable :param first_categorical_col: str :param second_categorical_col: str :param title: str :param subtitle: str :return: count plot """ _, ax = plt.subplots(figsize=(10, 6)) plt.suptitle(title.title(), fontweight="heavy", x=0.014, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="12", fontfamily="sans-serif", color=black_grad[1], loc="left", pad=38) ax = sns.countplot(x=first_categorical_col, hue=second_categorical_col, data=data, palette=pastel) ax.set(xlabel=first_categorical_col, frame_on=False, ylabel=None) for c in ax.containers: # custom label calculates percent and add an empty string so 0 value bars don't have a number labels = [f'{h / data[second_categorical_col].count() * 100:0.2f}%' if (h := v.get_height()) > 0 else '' for v in c] ax.bar_label(c, labels=labels, label_type='edge', fontsize=12, color=black_grad[1]) ax.legend(title=second_categorical_col.title(), bbox_to_anchor=(1, 1.02), loc="upper left") plt.tight_layout() plt.xlabel(f"{first_categorical_col}", labelpad=10, fontweight="bold", fontsize=16, fontfamily="sans-serif", color=black_grad[1]) plt.show()
[docs]def vis_heatmap(data): """ Visualize the correlation between multiple numeric columns. :param data: variable :return: heatmap """ numeric_features = data.select_dtypes(exclude=["object", "category"]) plt.figure(figsize=(10, 6)) plt.suptitle("Correlation Map", fontweight="heavy", x=0.15, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) sns.heatmap(numeric_features.corr(), cmap="Reds", annot=True, linewidths=0.1) plt.tight_layout(rect=[0, 0.04, 1, 1.025]) plt.show()
[docs]def visualize_linear_regression(data, first_numeric: str, second_numeric: str, title: str = "add chart title", subtitle: str = "explain ur data viz by subtitle "): """ Visualize the relationship between two numeric column. Add new title and subtitle content as a string. :param data: variable :param first_numeric: str :param second_numeric: str :param title: str :param subtitle: str :return: regression plot """ corr = data[first_numeric].corr(data[second_numeric]) print("*" * 25) print("Correlation between ", first_numeric.title(), " and ", second_numeric.title(), " is ", round(corr, 2)) print("*" * 25) _, ax = plt.subplots(figsize=(10, 6)) sns.regplot(x=first_numeric, y=second_numeric, data=data, line_kws={"color": "red"}, ax=ax) plt.suptitle(title.title(), fontweight="heavy", x=0.058, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="10", fontfamily="sans-serif", color=black_grad[1], loc="left", pad=38) plt.xlabel(first_numeric.upper(), fontweight="regular", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.ylabel(second_numeric.upper(), fontweight="regular", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.tight_layout(rect=[0, 0.04, 1, 1.025]) plt.grid(axis="x", alpha=0) plt.grid(axis="y", alpha=0.4) plt.show()
[docs]def visualize_causation(data, first_numeric: str, second_numeric: str, category_col: str, title: str = "add chart title", subtitle: str = "explain ur data viz by subtitle "): """ Visualize the relationship between two numeric column by a categorical column. Add new title and subtitle content as a string. :param data: variable :param first_numeric: str :param second_numeric: str :param category_col: str :param title: str :param subtitle: str :return: regression plot """ sns.lmplot(x=first_numeric, y=second_numeric, hue=category_col, data=data, palette=pastel, legend=False, height=6, aspect=1.5) plt.suptitle(title.title(), fontweight="heavy", x=0.065, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="10", fontfamily="sans-serif", color=black_grad[1], loc="left", pad=40) plt.xlabel(first_numeric.upper(), fontweight="regular", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.ylabel(second_numeric.upper(), fontweight="regular", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.legend(title=category_col.title(), bbox_to_anchor=(1, 1.02), loc="upper left") plt.tight_layout(rect=[0, 0.04, 1, 1.025]) plt.grid(axis="x", alpha=0) plt.grid(axis="y", alpha=0.4) plt.show()
[docs]def save_data_to_csv_file(data, filename: str): """ Save data to a csv file. :param data: variable :param filename: string :return: updated csv data file """ new_data = data.to_csv(filename, index=False) return new_data
[docs]def visualize_time_relationship(data, date_colum: str, numerical_column: str, filter_by: str, title: str = "add chart title", subtitle: str = "explain ur data viz by subtitle "): """ Visualize the sum of all shared data points for continuous variable by date variable, filter by : - Year - Month - Day Add new title and subtitle content as a string. :param data: variable :param date_colum: str :param numerical_column: str :param filter_by: str :param title: str :param subtitle: str :return: Line Chart """ fig, ax = plt.subplots(figsize=(10, 6)) if filter_by.lower() == "month": data[date_colum] = pd.to_datetime(data[date_colum]) data[filter_by] = data[date_colum].dt.month sns.lineplot(data=data, x=filter_by, y=numerical_column, linewidth=1, estimator="sum", errorbar=None, ax=ax) elif filter_by.lower() == "year": data[date_colum] = pd.to_datetime(data[date_colum]) data[filter_by] = data[date_colum].dt.year sns.lineplot(data=data, x=filter_by, y=numerical_column, linewidth=1, estimator="sum", errorbar=None, ax=ax) elif filter_by.lower() == "day": data[date_colum] = pd.to_datetime(data[date_colum]) data[filter_by] = data[date_colum].dt.day sns.lineplot(data=data, x=filter_by, y=numerical_column, linewidth=1, estimator="sum", errorbar=None, ax=ax) plt.suptitle(title.title(), fontweight="heavy", x=0.065, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="10", fontfamily="sans-serif", color=black_grad[1], loc="left", pad=20) plt.xlabel(filter_by.upper(), fontweight="regular", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.ylabel(numerical_column.upper(), fontweight="regular", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.tight_layout(rect=[0, 0.04, 1, 1.025]) plt.grid(axis="x", alpha=0) plt.grid(axis="y", alpha=0.4) ax.locator_params(integer=True) plt.show()
[docs]def visualize_time_relationship_by_categorical_variable(data, date_colum: str, numerical_col: str, categorical_colum: str, filter_by: str, title: str = "add chart title", subtitle: str = "explain ur data viz by subtitle "): """ Visualize the sum between date column and continues column with categorical column, filter by : - Year - Month - Day Add new title and subtitle content as a string. :param data: variable :param date_colum: str :param numerical_col: str :param filter_by: str :param title: str :param categorical_colum: str :param subtitle: str :return: Line Chart """ fig, ax = plt.subplots(figsize=(10, 6)) if filter_by.lower() == "month": data[date_colum] = pd.to_datetime(data[date_colum]) data[filter_by] = data[date_colum].dt.month sns.lineplot(data=data, x=filter_by, y=numerical_col, hue=categorical_colum, linewidth=2, estimator="sum", errorbar=None, color=pastel, ax=ax) elif filter_by.lower() == "year": data[date_colum] = pd.to_datetime(data[date_colum]) data[filter_by] = data[date_colum].dt.year sns.lineplot(data=data, x=filter_by, y=numerical_col, hue=categorical_colum, linewidth=2, estimator="sum", color=pastel, errorbar=None, ax=ax) elif filter_by.lower() == "day": data[date_colum] = pd.to_datetime(data[date_colum]) data[filter_by] = data[date_colum].dt.day sns.lineplot(data=data, x=filter_by, y=numerical_col, hue=categorical_colum, linewidth=2, estimator="sum", color=pastel, errorbar=None, ax=ax) plt.suptitle(title.title(), fontweight="heavy", x=0.065, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="10", fontfamily="sans-serif", color=black_grad[1], loc="left", pad=20) plt.xlabel(filter_by.upper(), fontweight="regular", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.ylabel(numerical_col.upper(), fontweight="regular", fontsize=11, fontfamily="sans-serif", color=black_grad[1]) plt.tight_layout(rect=[0, 0.04, 1, 1.025]) plt.grid(axis="x", alpha=0) plt.grid(axis="y", alpha=0.4) ax.locator_params(integer=True) plt.legend(title=categorical_colum.title(), bbox_to_anchor=(1, 1.02), loc="upper left", shadow=True) plt.show()
[docs]def vis_highest_percentage_datapoints(data, first_categorical_col: str, second_categorical_col: str, title: str = "add chart title", subtitle: str = "explain ur data viz by subtitle "): """ Visualize the highest percentage of datapoint values, for categorical variable grouped by second categorical variable. Add new title and subtitle content as a string. :param data: variable :param first_categorical_col: str :param second_categorical_col: str :param title: str :param subtitle: str :return: count plot """ _, ax = plt.subplots(figsize=(10, 6)) plt.suptitle(title.title(), fontweight="heavy", x=0.044, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="12", fontfamily="sans-serif", color=black_grad[1], loc="left", pad=40) ax = sns.countplot(x=first_categorical_col, hue=second_categorical_col, data=data, color="aquamarine", saturation=0.5) ax.set(xlabel=first_categorical_col, frame_on=False, ylabel=None) for c in ax.containers: # custom label calculates percent and add an empty string so 0 value bars don't have a number labels = [f'{h / data[second_categorical_col].count() * 100:0.1f}%' if (h := v.get_height()) > 0 else '' for v in c] ax.bar_label(c, labels=labels, label_type='edge', fontsize=12, color=black_grad[1]) # highest value patch_h = [patch.get_height() for patch in ax.patches] idx_tallest = np.argmax(patch_h) ax.patches[idx_tallest].set_facecolor("slateblue") ax.legend(title=second_categorical_col.title(), bbox_to_anchor=(1, 1.02), loc="upper left") plt.tight_layout() plt.xlabel(f"{first_categorical_col}", labelpad=10, fontweight="bold", fontsize=16, fontfamily="sans-serif", color=black_grad[1]) plt.grid(axis="x", alpha=0) plt.grid(axis="y", alpha=0) plt.show()
[docs]def vis_lowest_percentage_datapoints(data, first_categorical: str, second_categorical: str, title: str = "add chart title", subtitle: str = "explain ur data viz by subtitle "): """ Visualize the lowest percentage of datapoint values, for categorical variable grouped by second categorical variable. Add new title and subtitle content as a string. :param data: variable :param first_categorical: str :param second_categorical: str :param title: str :param subtitle: str :return: count plot """ _, ax = plt.subplots(figsize=(10, 6)) plt.suptitle(title.title(), fontweight="heavy", x=0.044, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="12", fontfamily="sans-serif", color=black_grad[1], loc="left", pad=40) ax = sns.countplot(x=first_categorical, hue=second_categorical, data=data, color="aquamarine", saturation=0.5) for c in ax.containers: # custom label calculates percent and add an empty string so 0 value bars don't have a number labels = [f'{h / data[second_categorical].count() * 100:0.1f}%' if (h := v.get_height()) > 0 else '' for v in c] ax.bar_label(c, labels=labels, label_type='edge', fontsize=10, color=colors[0]) # Lowest value patch_h = [patch.get_height() for patch in ax.patches] idx_tallest = np.argmin(patch_h) ax.patches[idx_tallest].set_facecolor("tomato") ax.set(xlabel=first_categorical, frame_on=False, ylabel=None) ax.legend(title=second_categorical.title(), bbox_to_anchor=(1, 1.02), loc="upper left") plt.tight_layout() plt.xlabel(f"{first_categorical}", labelpad=10, fontweight="bold", fontsize=16, fontfamily="sans-serif", color=black_grad[1]) plt.grid(axis="x", alpha=0) plt.grid(axis="y", alpha=0) plt.show()
[docs]def vis_top_highest_average(data_frame, categorical_column: list, numerical_column: str, avg_numbers: list, title: str = "add chart title", subtitle: str = "Explain ur data viz by subtitle "): """ Visualize the highest average values. add avg_numbers to determine which bars to be colored. Add new title and subtitle content as a string. :param data_frame: variable :param categorical_column: list :param numerical_column: str :param avg_numbers: list :param title: str :param subtitle: str :return: bar chart """ df = data_frame.groupby(categorical_column)[numerical_column].mean().round(2).sort_values(ascending=False)\ .nlargest(10) _, ax = plt.subplots(figsize=(10, 6)) plt.suptitle(title.title(), fontweight="heavy", x=0.069, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="12", fontfamily="sans-serif", color=black_grad[1], loc="left", pad=10) set_colors = [colors[1] if x in avg_numbers else colors[0] for x in df.values] fig = sns.barplot(data=pd.DataFrame(df).transpose(), orient="h", palette=set_colors) fig.set_ylabel(None) fig.set_xlabel(None) fig.set_yticklabels(fig.get_yticklabels(), rotation=0) fig.set_xticklabels([]) show_values(fig, "h") plt.grid(axis="y", alpha=0) plt.grid(axis="x", alpha=0) ax.set(frame_on=False) plt.tight_layout() plt.show()
[docs]def vis_the_highest_label_pie_chart(data, categorical_col: str, numerical_col: str, title: str = "add chart title", subtitle: str = "explain ur data viz by subtitle"): """ Visualize the highest label in pie chart. Add new title and subtitle content as a string. :param data: variable :param categorical_col: str :param numerical_col: str :param title: str :param subtitle: str :return: pie chart """ _, ax = plt.subplots(figsize=(10, 6)) plt.suptitle(title.title(), fontweight="heavy", x=0.27, y=0.99, fontsize="16", ha="left", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="10", fontfamily="sans-serif", color="dimgray", loc="left", pad=10) labels = data[categorical_col].unique() df = data.groupby(categorical_col)[numerical_col].mean().round(2) clrs = [colors[0] if (x < max(df.values)) else colors[1] for x in df.values] plt.pie(df, colors=clrs, pctdistance=0.7, autopct="%.2f%%", labels=labels, labeldistance=1.1, wedgeprops=dict(alpha=0.8, edgecolor="floralwhite"), textprops={"fontsize": 10}) centre = plt.Circle((0, 0), 0.45, fc="white", edgecolor=colors[0]) plt.gcf().gca().add_artist(centre) ax.legend(title=categorical_col.title(), bbox_to_anchor=(0.5, -0.04), ncol=4, loc="upper center", frameon=True, shadow=True) plt.tight_layout() plt.show()
[docs]def vis_top_ten_values(data, first_column: str, by_second_column: str, color_bar: list, title: str = "explain ur data viz by subtitle", subtitle: str = "explain ur data viz by subtitle"): """ Visualize top ten values, add color_bar as integer list to determine which bars to be colored. Add new title and subtitle content as a string. :param data: Variable :param first_column: str :param by_second_column: str :param color_bar: list :param title: str :param subtitle: str :return: Bar plot """ fig, ax = plt.subplots(figsize=(10, 6)) plt.suptitle(title.title(), fontweight="heavy", x=0.059, y=0.98, ha="left", fontsize="16", fontfamily="sans-serif", color=black_grad[0] ) plt.title(subtitle.capitalize(), fontweight="heavy", fontsize="12", fontfamily="sans-serif", color=black_grad[1], loc="left", pad=10) top_10 = data.groupby([first_column])[by_second_column].count().sort_values(ascending=False).iloc[:10] set_colors = [colors[1] if x in color_bar else colors[0] for x in top_10.values] fig = sns.barplot(data=pd.DataFrame(top_10).transpose(), orient="h", palette=set_colors) show_values(fig, "h") plt.xlabel(f"{first_column.title()}", labelpad=10, fontweight="bold", fontsize=16, fontfamily="sans-serif", color=black_grad[1]) plt.xlabel(None) plt.xticks([]) plt.grid(axis="y", alpha=0) plt.grid(axis="x", alpha=0) ax.set(frame_on=False) sns.despine(right=True, top=True, left=True) plt.tight_layout() plt.show()