Hello OpenAI Team,
We are building a custom PPE compliance system where each cropped image contains a single person, and we want to classify multiple PPE attributes per image.
Each image may have multiple labels such as:
Helmet: absent / wearing properly / not wearing properly
Vest: absent / wearing properly / not wearing properly
Gloves: absent / wearing properly / not wearing properly
Glasses: absent / wearing properly / not wearing properly
Boots: absent / wearing properly / not wearing properly
For example:
Image: image1101.jpg
Helmet: absent
Vest: wearing properly
Gloves: not wearing properly
Glasses: wearing properly
Boots: wearing properly
Currently, our CSV format looks like this:
| image | helmet | vest | gloves | glass | boots | caption |
| image1101.jpg | absent | wearing properly | not wearing properly | wearing properly | wearing properly | Helmet: absent, Vest: wearing properly, Gloves: present but not worn properly, Glasses: wearing properly, Boots: wearing properly |
The caption for this image is: "Helmet: absent, Vest: wearing properly, Gloves: present but not worn properly, Glasses: wearing properly, Boots: wearing properly."
We would like guidance on:
What is the recommended data structure for multi-label image classification using OpenAI models?
What is the best caption format for this type of task?
Are there best practices for training or fine-tuning when each image contains multiple related attributes?
I am also sharing the basic training code here for your reference:
import torch
import clip
import pandas as pd
import numpy as np
from PIL import Image
from torch.utils.data import Dataset, DataLoader
--------------------
CONFIG
--------------------
train_csv = "train_corrected.csv"
val_csv = "val_corrected.csv"
image_dir = "data/images"
model_name = "clip_ppe_best.pt"
print("......",model_name)
batch_size = 8
epochs = 50
lr = 1e-5
weight_decay = 1e-4
device = "cuda" if torch.cuda.is_available() else "cpu"
--------------------
LOAD CLIP (OpenAI)
--------------------
model, preprocess = clip.load("ViT-L/14", device=device)
model = model.float()
--------------------
FREEZE (SMALL DATA SAFE)
--------------------
for param in model.parameters():
param.requires_grad = False
Train projection heads only
model.visual.proj.requires_grad = True
model.text_projection.requires_grad = True
Freeze logit scale
model.logit_scale.requires_grad = False
model.train()
print("Context length:", model.context_length)
print("Vocab size:", model.vocab_size)
print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
# --------------------
# DATASET
# --------------------
class PPEClipDataset(Dataset):
    """Pairs each cropped-person image with its normalised PPE caption for CLIP."""

    def __init__(self, csv_path, image_dir):
        # NOTE: the original had `def init` — the constructor was never called,
        # so instantiation raised TypeError. Fixed to `__init__`.
        # CSV must contain at least 'image' and 'caption' columns.
        self.data = pd.read_csv(csv_path, encoding="latin1")
        # Normalise captions: non-breaking spaces -> spaces, trim, lowercase.
        self.data["caption"] = (
            self.data["caption"]
            .astype(str)
            .str.replace("\u00a0", " ")
            .str.strip()
            .str.lower()
        )
        self.image_dir = image_dir

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        image_path = f"{self.image_dir}/{row['image']}"
        caption = row['caption']
        image = Image.open(image_path).convert("RGB")
        image = preprocess(image)  # CLIP's own transform (global from clip.load)
        # clip.tokenize pads/truncates to the model context length; .long() for
        # the token-embedding lookup.
        text = clip.tokenize([caption])[0].long()
        return image, text
train_dataset = PPEClipDataset(train_csv, image_dir)
val_dataset = PPEClipDataset(val_csv, image_dir)

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,        # shuffle only the training split
    num_workers=2,
    pin_memory=True,     # faster host->GPU transfers
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

# --------------------
# OPTIMIZER & LOSS
# --------------------
# Optimise only the unfrozen parameters (the two projection heads).
optimizer = torch.optim.AdamW(
    (p for p in model.parameters() if p.requires_grad),
    lr=lr,
    weight_decay=weight_decay,
)
# Symmetric contrastive (InfoNCE) objective: cross-entropy in both directions.
loss_img = torch.nn.CrossEntropyLoss()
loss_txt = torch.nn.CrossEntropyLoss()
# --------------------
# VALIDATION FUNCTION
# --------------------
@torch.no_grad()
def validate(model, loader):
    """Return the mean contrastive loss of *model* over *loader*.

    Uses the module-level `device`, `loss_img`, and `loss_txt`.
    Restores training mode before returning.
    """
    model.eval()
    total_loss = 0.0
    batches = 0
    for images, texts in loader:
        images = images.to(device)
        texts = texts.to(device)
        logits_per_image, logits_per_text = model(images, texts)
        # Matching pairs sit on the diagonal, so target for row i is index i.
        ground_truth = torch.arange(len(images), device=device)
        loss = (
            loss_img(logits_per_image, ground_truth) +
            loss_txt(logits_per_text, ground_truth)
        ) / 2
        total_loss += loss.item()
        batches += 1
    model.train()
    return total_loss / max(batches, 1)  # guard against an empty loader
# --------------------
# TRAIN LOOP
# --------------------
best_val_loss = float("inf")

for epoch in range(epochs):
    total_loss = 0.0
    batches = 0
    for images, texts in train_loader:
        images = images.to(device)
        texts = texts.to(device)

        logits_per_image, logits_per_text = model(images, texts)
        # Correct image/text pairs lie on the diagonal of the logit matrix.
        ground_truth = torch.arange(len(images), device=device)
        loss = (
            loss_img(logits_per_image, ground_truth) +
            loss_txt(logits_per_text, ground_truth)
        ) / 2

        optimizer.zero_grad()
        loss.backward()
        # Clip gradients: CLIP fine-tuning is prone to unstable updates.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        # Keep the temperature within CLIP's published bound (exp(4.6052) = 100).
        # NOTE(review): logit_scale is frozen above, so this clamp is a no-op
        # safeguard here; harmless to keep.
        model.logit_scale.data.clamp_(0, 4.6052)

        total_loss += loss.item()
        batches += 1

    train_loss = total_loss / max(batches, 1)
    val_loss = validate(model, val_loader)
    print(
        f"Epoch [{epoch+1}/{epochs}] "
        f"Train Loss: {train_loss:.4f} | "
        f"Val Loss: {val_loss:.4f}"
    )

    # Save best model (lowest validation loss so far).
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), model_name)
        print("✅ Saved best model", model_name)

print("🎉 Training complete")
Hello OpenAI Team,
We are building a custom PPE compliance system where each cropped image contains a single person, and we want to classify multiple PPE attributes per image.
Each image may have multiple labels such as:
Helmet: absent / wearing properly / not wearing properly
Vest: absent / wearing properly / not wearing properly
Gloves: absent / wearing properly / not wearing properly
Glasses: absent / wearing properly / not wearing properly
Boots: absent / wearing properly / not wearing properly
For example:
Image: image1101.jpg
Helmet: absent
Vest: wearing properly
Gloves: not wearing properly
Glasses: wearing properly
Boots: wearing properly
Currently, our CSV format looks like this:
The caption for this image is: "Helmet: absent, Vest: wearing properly, Gloves: present but not worn properly, Glasses: wearing properly, Boots: wearing properly."
We would like guidance on:
What is the recommended data structure for multi-label image classification using OpenAI models?
What is the best caption format for this type of task?
Are there best practices for training or fine-tuning when each image contains multiple related attributes?
I am also sharing the basic training code here for your reference:
import torch
import clip
import pandas as pd
import numpy as np
from PIL import Image
from torch.utils.data import Dataset, DataLoader
# --------------------
# CONFIG
# --------------------
train_csv = "train_corrected.csv"   # training split: 'image' + 'caption' columns
val_csv = "val_corrected.csv"       # validation split, same schema
image_dir = "data/images"           # root folder with the cropped-person images
model_name = "clip_ppe_best.pt"     # checkpoint path for the best model
print("......", model_name)

batch_size = 8
epochs = 50
lr = 1e-5                           # small LR: only the projection heads are tuned
weight_decay = 1e-4
device = "cuda" if torch.cuda.is_available() else "cpu"

# --------------------
# LOAD CLIP (OpenAI)
# --------------------
model, preprocess = clip.load("ViT-L/14", device=device)
model = model.float()  # cast to fp32 for stable fine-tuning of the fp16 checkpoint

# --------------------
# FREEZE (SMALL DATA SAFE)
# --------------------
for param in model.parameters():
    param.requires_grad = False

# Train projection heads only.
model.visual.proj.requires_grad = True
model.text_projection.requires_grad = True

# Freeze logit scale (the learned temperature).
model.logit_scale.requires_grad = False

model.train()

print("Context length:", model.context_length)
print("Vocab size:", model.vocab_size)
# p.numel() counts parameters directly; no numpy detour needed.
print("Model parameters:", f"{sum(p.numel() for p in model.parameters()):,}")
# --------------------
# DATASET
# --------------------
class PPEClipDataset(Dataset):
    """Loads the image/caption CSV for CLIP training.

    NOTE(review): this second paste of the post is truncated — only the CSV
    load survived; the complete definition appears earlier in the post.
    """

    def __init__(self, csv_path, image_dir):
        # Fixed `def init` -> `__init__` so the constructor actually runs.
        self.data = pd.read_csv(csv_path, encoding="latin1")
        self.image_dir = image_dir  # keep the path for __getitem__ use
train_dataset = PPEClipDataset(train_csv, image_dir)
val_dataset = PPEClipDataset(val_csv, image_dir)

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,        # shuffle only the training split
    num_workers=2,
    pin_memory=True,     # faster host->GPU transfers
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

# --------------------
# OPTIMIZER & LOSS
# --------------------
# Optimise only the unfrozen parameters (the two projection heads).
optimizer = torch.optim.AdamW(
    (p for p in model.parameters() if p.requires_grad),
    lr=lr,
    weight_decay=weight_decay,
)
# Symmetric contrastive (InfoNCE) objective: cross-entropy in both directions.
loss_img = torch.nn.CrossEntropyLoss()
loss_txt = torch.nn.CrossEntropyLoss()
# --------------------
# VALIDATION FUNCTION
# --------------------
@torch.no_grad()
def validate(model, loader):
    # NOTE(review): truncated duplicate — the complete validate() appears
    # earlier in the post; this stub only switches the model to eval mode.
    model.eval()
# --------------------
# TRAIN LOOP
# --------------------
best_val_loss = float("inf")

for epoch in range(epochs):
    # NOTE(review): truncated duplicate — the full training loop appears
    # earlier in the post; only the per-epoch counters survived here.
    total_loss = 0
    batches = 0

print("🎉 Training complete")