HierCore/analyze_dbscan_clustering.py at main · DSBA-Lab/HierCore · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
#!/usr/bin/env python
"""
DBSCAN Clustering Analysis Script

Analyzes DBSCAN clustering on semantic features extracted from anomaly detection datasets.
Computes silhouette score, purity, and NMI for the clustering result.

Reads experiment config from results folder and saves analysis results there.

Usage:
    python analyze_dbscan_clustering.py --config results/MVTec/HierarchicalPatchCore/greedy0.1-layer4-dbscan/all/config.yaml
    python analyze_dbscan_clustering.py --config results/MPDD/HierarchicalPatchCore/greedy0.1-layer4-dbscan/all/config.yaml
"""

import argparse
import os
import json
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import silhouette_score, silhouette_samples, normalized_mutual_info_score
from sklearn.cluster import DBSCAN
import yaml

import backbones
from dataset import create_dataset, DatasetSplit


def load_config(config_path):
    """Load experiment config from yaml file."""
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)
    return config


def extract_semantic_features(dataloader, backbone, device, semantic_layer='layer4'):
    """Extract semantic features using specified backbone layer."""
    features_buffer = {}

    def hook_fn(module, input, output):
        features_buffer['feat'] = output

    layer = getattr(backbone, semantic_layer)
    handle = layer.register_forward_hook(hook_fn)

    all_features = []
    all_classes = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Extracting features'):
            images = batch['image'].to(device)
            classes = batch['classname']

            _ = backbone(images)

            feat = features_buffer['feat']
            feat = F.adaptive_avg_pool2d(feat, (1, 1))
            feat = feat.view(feat.size(0), -1)

            all_features.append(feat.cpu().numpy())
            all_classes.extend(classes)

    handle.remove()
    return np.vstack(all_features), all_classes


def compute_purity(clusters, labels, per_sample_sil=None):
    """Compute clustering purity.

    If per_sample_sil (array of silhouette values per sample) is provided, compute
    the mean silhouette per cluster and include it in the returned cluster_info.
    """
    unique_labels = sorted(set(labels))
    label_to_int = {l: i for i, l in enumerate(unique_labels)}
    labels_int = np.array([label_to_int[l] for l in labels])

    total_correct = 0
    cluster_info = []

    for cluster_id in np.unique(clusters):
        mask = clusters == cluster_id
        cluster_labels = labels_int[mask]

        if len(cluster_labels) == 0:
            continue

        counts = np.bincount(cluster_labels, minlength=len(unique_labels))
        dominant_count = counts.max()
        dominant_label = unique_labels[counts.argmax()]
        total_correct += dominant_count

        info = {
            'cluster_id': int(cluster_id),
            'size': int(mask.sum()),
            'dominant_class': dominant_label,
            'purity': float(dominant_count / mask.sum() * 100)
        }

        if per_sample_sil is not None:
            # average silhouette of samples in this cluster
            sil_vals = per_sample_sil[mask]
            # If cluster has only one sample, silhouette is not defined; set to NaN
            if len(sil_vals) == 0:
                info['silhouette'] = None
            else:
                info['silhouette'] = float(np.nanmean(sil_vals))
        else:
            info['silhouette'] = None

        cluster_info.append(info)

    overall_purity = total_correct / len(labels) * 100
    return overall_purity, cluster_info


def compute_nmi(clusters, labels):
    """Compute Normalized Mutual Information."""
    unique_labels = sorted(set(labels))
    label_to_int = {l: i for i, l in enumerate(unique_labels)}
    labels_int = np.array([label_to_int[l] for l in labels])
    return normalized_mutual_info_score(labels_int, clusters)


def analyze_dbscan(features, labels, eps=0.5, min_samples=5):
    """Run DBSCAN clustering and analyze results."""
    print('\nRunning DBSCAN clustering...')
    print(f'Parameters: eps={eps}, min_samples={min_samples}')

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    clusters = dbscan.fit_predict(features)

    # Handle noise points (labeled as -1)
    noise_mask = clusters == -1
    num_noise = noise_mask.sum()
    num_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)

    print(f'Number of clusters found: {num_clusters}')
    print(f'Number of noise points: {num_noise}')

    # If noise points exist, assign them to nearest cluster (similar to original code)
    if num_noise > 0:
        print(f'Assigning {num_noise} noise points to nearest clusters...')
        from sklearn.metrics.pairwise import euclidean_distances

        # Get core samples (non-noise points)
        core_indices = np.where(clusters != -1)[0]
        noise_indices = np.where(clusters == -1)[0]

        if len(core_indices) > 0:
            # Compute distances from noise points to all points
            distances = euclidean_distances(features[noise_indices], features[core_indices])
            # Find nearest core point for each noise point
            nearest_core_indices = core_indices[np.argmin(distances, axis=1)]
            # Assign cluster of nearest core point
            clusters[noise_indices] = clusters[nearest_core_indices]
        else:
            # If no core samples, assign all to cluster 0
            clusters[noise_indices] = 0

    # Recompute after noise assignment
    num_clusters = len(set(clusters))
    print(f'Final number of clusters: {num_clusters}')

    # Compute metrics
    try:
        silhouette = silhouette_score(features, clusters)
    except Exception as e:
        print(f"Warning: Could not compute silhouette score: {e}")
        silhouette = None

    purity, cluster_info = compute_purity(clusters, labels)
    nmi = compute_nmi(clusters, labels)

    print('\n' + '=' * 80)
    print('DBSCAN Clustering Results')
    print('=' * 80)
    print(f'Silhouette Score: {silhouette:.4f}' if silhouette is not None else 'Silhouette Score: N/A')
    print(f'Purity:           {purity:.2f}%')
    print(f'NMI:              {nmi:.4f}')
    print('=' * 80)

    # Per-sample silhouette for cluster info
    if silhouette is not None:
        try:
            per_sample_sil = silhouette_samples(features, clusters)
            _, cluster_info = compute_purity(clusters, labels, per_sample_sil)
        except Exception:
            per_sample_sil = None

    print(f'\nCluster Distribution:')
    print('-' * 60)
    for info in sorted(cluster_info, key=lambda x: x['cluster_id']):
        sil_str = f", silhouette: {info['silhouette']:.3f}" if info['silhouette'] is not None else ""
        print(f"  Cluster {info['cluster_id']:3d}: {info['size']:4d} samples, "
              f"dominant: {info['dominant_class']:15s}, purity: {info['purity']:.1f}%{sil_str}")

    return {
        'eps': eps,
        'min_samples': min_samples,
        'num_clusters': num_clusters,
        'num_noise': num_noise,
        'silhouette': silhouette,
        'purity': purity,
        'nmi': nmi
    }, clusters, cluster_info


def save_results(save_path, results, cluster_info, labels):
    """Save analysis results to JSON file."""
    unique_labels = sorted(set(labels))

    output = {
        'num_samples': len(labels),
        'num_ground_truth_classes': len(unique_labels),
        'ground_truth_classes': unique_labels,
        'dbscan_params': {
            'eps': float(results['eps']),
            'min_samples': int(results['min_samples'])
        },
        'results': {
            'num_clusters': int(results['num_clusters']),
            'num_noise': int(results['num_noise']),
            'silhouette': float(results['silhouette']) if results['silhouette'] is not None else None,
            'purity': float(results['purity']),
            'nmi': float(results['nmi'])
        },
        'cluster_distribution': [
            {
                'cluster_id': int(info['cluster_id']),
                'size': int(info['size']),
                'dominant_class': info['dominant_class'],
                'purity': float(info['purity']),
                'silhouette': (float(info['silhouette']) if ('silhouette' in info and info['silhouette'] is not None) else None)
            }
            for info in sorted(cluster_info, key=lambda x: x['cluster_id'])
        ]
    }

    with open(save_path, 'w') as f:
        json.dump(output, f, indent=2)

    return output


def main():
    parser = argparse.ArgumentParser(description='Analyze DBSCAN clustering on semantic features')
    parser.add_argument('--config', type=str, required=True,
                        help='Path to experiment config.yaml file')
    parser.add_argument('--device', type=str, default='cuda',
                        help='Device (cuda or cpu)')
    parser.add_argument('--eps', type=float, default=0.5,
                        help='DBSCAN eps parameter')
    parser.add_argument('--min_samples', type=int, default=5,
                        help='DBSCAN min_samples parameter')
    args = parser.parse_args()

    # Load config
    config = load_config(args.config)
    config_dir = os.path.dirname(args.config)

    # Extract parameters from config
    dataset_name = config['DATASET']['name']
    data_path = config['DATASET']['datadir']
    resize = config['DATASET']['resize']
    imagesize = config['DATASET']['imagesize']
    classname = config['DATASET']['classname']
    backbone_name = config['MODEL']['backbone']
    semantic_layer = config['MODEL'].get('semantic_layer', 'layer4')
    batch_size = config['TRAIN'].get('test_batch_size', 64)
    num_workers = config['TRAIN'].get('num_workers', 8)

    device = torch.device(args.device if torch.cuda.is_available() else 'cpu')

    print('=' * 80)
    print('DBSCAN Clustering Analysis')
    print('=' * 80)
    print(f'Config:         {args.config}')
    print(f'Dataset:        {dataset_name}')
    print(f'Data path:      {data_path}')
    print(f'Classname:      {classname}')
    print(f'Resize:         {resize}')
    print(f'Image size:     {imagesize}')
    print(f'Backbone:       {backbone_name}')
    print(f'Semantic layer: {semantic_layer}')
    print(f'Device:         {device}')
    print(f'DBSCAN eps:     {args.eps}')
    print(f'DBSCAN min_samples: {args.min_samples}')
    print()

    # Load dataset
    print('Loading dataset...')
    dataset = create_dataset(
        dataname=dataset_name,
        source=data_path,
        classname=classname,
        resize=resize,
        imagesize=imagesize,
        split=DatasetSplit.TRAIN
    )

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True
    )

    print(f'Total samples: {len(dataset)}')

    # Load backbone
    print(f'\nLoading backbone ({backbone_name})...')
    backbone = backbones.load(backbone_name)
    backbone = backbone.to(device)
    backbone.eval()

    # Extract features
    print(f'\nExtracting semantic features from {semantic_layer}...')
    features, labels = extract_semantic_features(
        dataloader, backbone, device, semantic_layer
    )
    print(f'Features shape: {features.shape}')
    print(f'Unique classes: {len(set(labels))}')

    # Analyze DBSCAN clustering
    results, clusters, cluster_info = analyze_dbscan(features, labels, eps=args.eps, min_samples=args.min_samples)

    # Save results
    save_path = os.path.join(config_dir, 'dbscan_analysis.json')
    output = save_results(save_path, results, cluster_info, labels)

    print('\n' + '=' * 80)
    print(f'Results saved to: {save_path}')
    print('=' * 80)


if __name__ == '__main__':
    main()