Pediacitric_Appendicitis_Python/setup_script.py at main · AnasAmchaar/Pediacitric_Appendicitis_Python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# This script sets up necessary utility functions and packages. Run it before any of the other scripts!
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import mannwhitneyu, chi2_contingency
from statsmodels.stats.multitest import multipletests
import random

########################## Utility functions

# ! setup_script.py ==================
# todo: Random Forest + Bootstrap Variable Selection
# ! ==================================


def compute_area(x_values, y_values):
    """Return the area under the piecewise-linear curve through the points.

    The trapezoidal rule is applied to every pair of neighbouring points
    ``(x_values[k], y_values[k])`` / ``(x_values[k + 1], y_values[k + 1])``
    and the individual trapezoid areas are added up. When ``x_values`` holds
    fewer than two entries there is nothing to integrate and 0 is returned.
    """
    # One trapezoid per gap between consecutive x positions.
    segment_count = len(x_values) - 1
    return sum(
        (y_values[k] + y_values[k + 1]) * (x_values[k + 1] - x_values[k]) / 2
        for k in range(segment_count)
    )

def make_transparent(color, alpha=0.4):
    """Return ``color`` as an RGBA tuple whose alpha channel is ``alpha``.

    Parameters
    ----------
    color : str or sequence
        Either a colour string that ``matplotlib.colors.to_rgba`` can resolve,
        or an indexable sequence whose first three entries are taken as the
        R, G and B components (any further entries are discarded).
    alpha : float, default 0.4
        Value placed in the alpha channel of the result.

    Returns
    -------
    tuple
        A 4-tuple ``(r, g, b, alpha)``.
    """
    if isinstance(color, str):
        # Imported lazily: only string colours need matplotlib, so plain
        # RGB/RGBA sequences keep working when matplotlib is not installed.
        import matplotlib.colors as mcolors

        # Resolve the string to RGBA first, then override the alpha channel on
        # the resolved tuple (kept as two steps to preserve existing results).
        rgba = mcolors.to_rgba(color)
        return mcolors.to_rgba(rgba, alpha=alpha)

    # Already a component sequence: keep R, G, B and replace/append alpha.
    return (*color[:3], alpha)

def rf_var_selection(X, y, q, B=100):
    """Random forest variable selection with bootstrap.

    Draws ``B`` bootstrap resamples of the rows, fits a
    ``RandomForestClassifier`` on each resample, and records how often every
    column of ``X`` ranks among the ``q`` most important features.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; accessed through ``.columns``, ``.shape`` and ``.iloc``.
    y : pandas.Series
        Target values, matched to the rows of ``X`` by position (``.iloc``).
    q : int
        Number of top-ranked features counted per resample. ``0`` counts no
        feature; values above the number of columns count every column.
    B : int, default 100
        Number of bootstrap resamples.

    Returns
    -------
    dict
        Maps each column name of ``X`` to the fraction of the ``B`` resamples
        in which that column was among the top ``q`` features.

    Raises
    ------
    ValueError
        If ``q`` is negative.

    Notes
    -----
    The global ``random`` and ``numpy.random`` generators are reseeded with 42
    on every call; resample ``b`` trains its forest with ``random_state=b``.
    """
    if q < 0:
        raise ValueError(f"q must be non-negative, got {q}")

    random.seed(42)
    np.random.seed(42)

    columns = X.columns
    n_samples = X.shape[0]
    n_features = len(columns)
    # Use an explicit start index rather than ``[-q:]``: with q == 0 the
    # negative slice ``[-0:]`` would select *every* feature instead of none.
    n_top = min(q, n_features)
    top_start = n_features - n_top

    counts = {col: 0 for col in columns}

    for b in range(B):
        # Bootstrap sampling: row positions drawn with replacement.
        idx_b = np.random.choice(n_samples, size=n_samples, replace=True)

        # Train a random forest on the resampled rows.
        rf_b = RandomForestClassifier(random_state=b)
        rf_b.fit(X.iloc[idx_b], y.iloc[idx_b])

        # argsort is ascending, so the tail holds the most important features.
        ranked = np.argsort(rf_b.feature_importances_)
        for idx in ranked[top_start:]:
            counts[columns[idx]] += 1

    # Normalize counts into selection frequencies.
    return {col: count / B for col, count in counts.items()}

def summary_stats(data, y, adjust_method="hommel", seed=42):
    """Calculate statistical tests between groups.

    Every column of ``data`` is tested against the grouping ``y`` and the
    resulting p-values are corrected for multiple testing.

    * Numeric columns: Mann-Whitney U test between the rows where the group
      code is 0 and the rows where it is 1 (missing values dropped). Rows with
      any other group code do not take part in this test.
    * All other columns: chi-square test on the contingency table of the
      column against ``y``.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per variable to test.
    y : pandas.Series
        Group labels. A non-numeric ``y`` (object, category, string, ...) is
        integer-coded with ``pd.factorize`` so the first label encountered
        becomes group 0 and the second group 1; a numeric ``y`` is compared
        with 0 and 1 directly.
    adjust_method : str, default "hommel"
        Passed as ``method`` to ``statsmodels`` ``multipletests``.
    seed : int, default 42
        Seed applied to the global NumPy random generator.

    Returns
    -------
    dict
        ``{'pvals': {column: adjusted p-value},
        'stats': {column: test statistic}}``. Both inner dicts are empty when
        ``data`` has no columns.
    """
    # Nothing in this function body draws random numbers itself; the global
    # seeding is kept because callers may rely on it as a side effect.
    np.random.seed(seed)

    # Integer-code any non-numeric target. Checking "not numeric" (instead of
    # only object/category) also covers pandas string dtypes, which would
    # otherwise never equal 0 or 1 and leave both groups empty.
    if pd.api.types.is_numeric_dtype(y):
        y_numeric = y
    else:
        y_numeric = pd.factorize(y)[0]

    pvals = {}
    stats = {}

    for col in data.columns:
        column = data[col]
        if pd.api.types.is_numeric_dtype(column):
            # Mann-Whitney U (Wilcoxon rank-sum) test for numeric data; the
            # two groups are only materialised on this branch.
            x1 = data.loc[y_numeric == 0, col].dropna()
            x2 = data.loc[y_numeric == 1, col].dropna()
            stat, p_value = mannwhitneyu(x1, x2)
        else:
            # Chi-square test for categorical data, using the original labels.
            contingency_table = pd.crosstab(column, y)
            stat, p_value, _, _ = chi2_contingency(contingency_table)
        pvals[col] = p_value
        stats[col] = stat

    # No columns means no p-values to adjust.
    if not pvals:
        return {'pvals': pvals, 'stats': stats}

    # Adjust p-values for multiple testing (element [1] holds the corrected
    # p-values, in the same order as the input).
    raw_pvals = np.array(list(pvals.values()))
    adjusted_pvals = multipletests(raw_pvals, method=adjust_method)[1]
    pvals = dict(zip(pvals, adjusted_pvals))

    return {'pvals': pvals, 'stats': stats}