adds uncertainty sampling #14

Open · wants to merge 1 commit into main
92 changes: 78 additions & 14 deletions src/active_learning.py
@@ -2,18 +2,23 @@
from src import detection
import pandas as pd
import geopandas as gpd
import numpy as np

def human_review(predictions, min_detection_score=0.6, min_classification_score=0.5, confident_threshold=0.5):
"""
    Divide existing predictions into confident and uncertain sets for human review.
Args:
        predictions (pd.DataFrame): A DataFrame of existing predictions to split.
        min_detection_score (float, optional): The minimum detection score for a prediction to be included. Defaults to 0.6.
        min_classification_score (float, optional): The minimum classification score for a prediction to be included. Defaults to 0.5.
        confident_threshold (float, optional): The score threshold used to separate confident from uncertain predictions. Defaults to 0.5.
Returns:
tuple: A tuple of confident and uncertain predictions.
"""
"""
# Check if predictions is None or empty
if predictions is None or predictions.empty:
return pd.DataFrame(), pd.DataFrame()

filtered_predictions = predictions[
(predictions["score"] >= min_detection_score) &
(predictions["cropmodel_score"] < min_classification_score)
@@ -42,14 +47,15 @@ def generate_pool_predictions(pool, patch_size=512, patch_overlap=0.1, min_score
model_path (str, optional): The path to the model checkpoint file. Defaults to None. Only used in combination with dask.
dask_client (dask.distributed.Client, optional): A Dask client for parallel processing. Defaults to None.
batch_size (int, optional): The batch size for prediction. Defaults to 16.
        crop_model (deepforest.model.CropModel, optional): A CropModel object. Defaults to None.
pool_limit (int, optional): The maximum number of images to consider. Defaults to 1000.

Returns:
pd.DataFrame: A DataFrame of predictions.
"""

if pool is None:
return None

    # Subsample the pool to at most pool_limit images
if len(pool) > pool_limit:
pool = random.sample(pool, pool_limit)
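For orientation, a hedged sketch of calling this function: `pool` is assumed to be a list of image paths, since it is passed to `random.sample`, and the keyword names mirror the docstring above.

```python
from src.active_learning import generate_pool_predictions  # module path assumed

pool = ["data/tile_001.tif", "data/tile_002.tif"]  # hypothetical image paths
pool_predictions = generate_pool_predictions(
    pool,
    patch_size=512,
    patch_overlap=0.1,
    batch_size=16,
    pool_limit=1000,
)
# Returns a pd.DataFrame of predictions, or None when pool is None.
```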
@@ -97,32 +103,90 @@ def select_images(preannotations, strategy, n=10, target_labels=None, min_score
        tuple: A list of chosen image paths and a DataFrame of preannotations for those images.
"""

if preannotations is None or preannotations.empty:
return [], None

if strategy == "random":
unique_imgs = preannotations["image_path"].unique().tolist()
k = min(n, len(unique_imgs))
chosen_images = random.sample(unique_imgs, k)

else:
preannotations = preannotations[preannotations["score"] >= min_score]

if strategy == "most-detections":
# Sort images by total number of predictions
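            # (each row is one predicted box, so the group size is the per-image detection count)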
chosen_images = (
preannotations
.groupby("image_path")
.size()
.sort_values(ascending=False)
.head(n)
.index
.tolist()
)

elif strategy == "target-labels":
if target_labels is None:
raise ValueError("Target labels are required for the 'target-labels' strategy.")
# Filter images by target labels
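            # and rank the remaining images by their mean prediction score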
subset = preannotations[preannotations["cropmodel_label"].isin(target_labels)]
if subset.empty:
return [], None
chosen_images = (
subset
.groupby("image_path")["score"]
.mean()
.sort_values(ascending=False)
.head(n)
.index
.tolist()
)

elif strategy == "rarest":
# Sort images by least common label
label_counts = preannotations.groupby("cropmodel_label").size().sort_values(ascending=True)
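            # Tag each prediction with its label's global count, then keep one row
            # per image (its rarest label) and take the n images with the rarest labels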
temp = preannotations.copy()
temp["label_count"] = temp["cropmodel_label"].map(label_counts)
temp.sort_values("label_count", ascending=True, inplace=True)
chosen_images = (
temp
.drop_duplicates(subset=["image_path"], keep="first")
.head(n)["image_path"]
.tolist()
)

elif strategy == "uncertainty":
# Images with classification scores closest to 0.5 (most uncertain)
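            # e.g. a score of 0.48 maps to 0.02 (ambiguous, selected early),
            # while 0.95 maps to 0.45 (confident, selected late)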
temp = preannotations.copy()
if "cropmodel_score" in temp.columns:
temp["uncertainty_score"] = np.abs(temp["cropmodel_score"] - 0.5)
else:
temp["uncertainty_score"] = np.abs(temp["score"] - 0.5)
img_scores = temp.groupby("image_path")["uncertainty_score"].mean()
chosen_images = img_scores.nsmallest(n).index.tolist()

elif strategy == "qbc":
# Query-By-Committee: combine random + uncertainty picks
# 1) random pick
unique_imgs = preannotations["image_path"].unique().tolist()
k = min(n, len(unique_imgs))
random_imgs = random.sample(unique_imgs, k)

# 2) uncertainty pick
temp = preannotations.copy()
if "cropmodel_score" in temp.columns:
temp["uncertainty_score"] = np.abs(temp["cropmodel_score"] - 0.5)
else:
temp["uncertainty_score"] = np.abs(temp["score"] - 0.5)
img_scores = temp.groupby("image_path")["uncertainty_score"].mean()
uncertain_imgs = img_scores.nsmallest(n).index.tolist()

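            # De-duplicate while preserving insertion order, so the random picks
            # take priority over the uncertainty picks in the final cut of n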
combined = list(dict.fromkeys(random_imgs + uncertain_imgs))
chosen_images = combined[:n]

else:
raise ValueError("Invalid strategy. Must be one of 'random', 'most-detections', or 'target-labels'.")
raise ValueError(f"Invalid strategy '{strategy}'")

# Get preannotations for chosen images
chosen_preannotations = preannotations[preannotations["image_path"].isin(chosen_images)]
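Putting the selection step together, a short sketch under the assumption, consistent with the `return [], None` early exit above, that the function returns a `(chosen_images, chosen_preannotations)` tuple:

```python
chosen_images, chosen_preannotations = select_images(
    preannotations=pool_predictions,  # e.g. the output of generate_pool_predictions
    strategy="uncertainty",  # or "qbc", "random", "most-detections", "target-labels", "rarest"
    n=10,
)
```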
6 changes: 6 additions & 0 deletions tests/test_active_learning.py
@@ -73,4 +73,10 @@ def test_select_train_images(detection_model):
strategy='random',
n=1
)
    train_images_to_annotate = select_images(
        preannotations=train_image_pool,
        strategy='qbc',
        n=1
    )
assert len(train_images_to_annotate) > 0