yifehuang97 committed
Commit 74af434 · 1 Parent(s): caa3cab
app.py ADDED
@@ -0,0 +1,221 @@
1
+ import os
2
+ import gradio as gr
3
+ import torch
4
+ from PIL import Image, ImageDraw
5
+ from transformers import GroundingDinoProcessor
6
+ from hf_model import CountEX
7
+ from utils import post_process_grounded_object_detection
8
+
9
+ # Global variables for model and processor
10
+ model = None
11
+ processor = None
12
+ device = None
13
+
14
+
15
+ def load_model():
16
+ """Load model and processor once at startup"""
17
+ global model, processor, device
18
+
19
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
+
21
+ # Load model - change path for HF Spaces
22
+ model_id = "BBVisual/CountEX-KC" # Change to your HF model repo
23
+ model = CountEX.from_pretrained(model_id)
24
+ model = model.to(torch.bfloat16)
25
+ model = model.to(device)
26
+ model.eval()
27
+
28
+ # Load processor
29
+ processor_id = "fushh7/llmdet_swin_tiny_hf"
30
+ processor = GroundingDinoProcessor.from_pretrained(processor_id)
31
+
32
+ return model, processor, device
33
+
34
+
35
+ def count_objects(image, pos_caption, neg_caption, box_threshold, point_radius, point_color):
36
+ """
37
+ Main inference function for counting objects
38
+
39
+ Args:
40
+ image: Input PIL Image
41
+ pos_caption: Positive prompt (objects to count)
42
+ neg_caption: Negative prompt (objects to exclude)
43
+ box_threshold: Detection confidence threshold
44
+ point_radius: Radius of visualization points
45
+ point_color: Color of visualization points
46
+
47
+ Returns:
48
+ Annotated image and count
49
+ """
50
+ global model, processor, device
51
+
52
+ if model is None:
53
+ load_model()
54
+
55
+ # Ensure image is RGB
56
+ if image.mode != "RGB":
57
+ image = image.convert("RGB")
58
+
59
+ # Ensure captions end with period
60
+ if not pos_caption.endswith('.'):
61
+ pos_caption = pos_caption + '.'
62
+ if neg_caption and not neg_caption.endswith('.'):
63
+ neg_caption = neg_caption + '.'
64
+
65
+ # Process positive caption
66
+ pos_inputs = processor(
67
+ images=image,
68
+ text=pos_caption,
69
+ return_tensors="pt",
70
+ padding=True
71
+ )
72
+ pos_inputs = pos_inputs.to(device)
73
+ pos_inputs['pixel_values'] = pos_inputs['pixel_values'].to(torch.bfloat16)
74
+
75
+ # Process negative caption if provided
76
+ use_neg = bool(neg_caption and neg_caption.strip() and neg_caption != '.')
77
+
78
+ if use_neg:
79
+ neg_inputs = processor(
80
+ images=image,
81
+ text=neg_caption,
82
+ return_tensors="pt",
83
+ padding=True
84
+ )
85
+ neg_inputs = {k: v.to(device) for k, v in neg_inputs.items()}
86
+ neg_inputs['pixel_values'] = neg_inputs['pixel_values'].to(torch.bfloat16)
87
+
88
+ # Add negative inputs to positive inputs dict
89
+ pos_inputs['neg_token_type_ids'] = neg_inputs['token_type_ids']
90
+ pos_inputs['neg_attention_mask'] = neg_inputs['attention_mask']
91
+ pos_inputs['neg_pixel_mask'] = neg_inputs['pixel_mask']
92
+ pos_inputs['neg_pixel_values'] = neg_inputs['pixel_values']
93
+ pos_inputs['neg_input_ids'] = neg_inputs['input_ids']
94
+ pos_inputs['use_neg'] = True
95
+ else:
96
+ pos_inputs['use_neg'] = False
97
+
98
+ # Run inference
99
+ with torch.no_grad():
100
+ outputs = model(**pos_inputs)
101
+
102
+ # Post-process outputs
103
+ outputs["pred_points"] = outputs["pred_boxes"][:, :, :2]
104
+ outputs["pred_logits"] = outputs["logits"]
105
+
106
+ # Use custom threshold if provided, otherwise use model default
107
+ threshold = box_threshold if box_threshold > 0 else model.box_threshold
108
+ results = post_process_grounded_object_detection(outputs, box_threshold=threshold)[0]
109
+
110
+ # Extract points
111
+ boxes = results["boxes"]
112
+ boxes = [box.tolist() for box in boxes]
113
+ points = [[box[0], box[1]] for box in boxes]
114
+
115
+ # Visualize results
116
+ img_w, img_h = image.size
117
+ img_draw = image.copy()
118
+ draw = ImageDraw.Draw(img_draw)
119
+
120
+ for point in points:
121
+ x = point[0] * img_w
122
+ y = point[1] * img_h
123
+ draw.ellipse(
124
+ [x - point_radius, y - point_radius, x + point_radius, y + point_radius],
125
+ fill=point_color
126
+ )
127
+
128
+ count = len(points)
129
+
130
+ return img_draw, f"Count: {count}"
131
+
132
+
133
+ # Create Gradio interface
134
+ def create_demo():
135
+ with gr.Blocks(title="CountEX: Discriminative Visual Counting") as demo:
136
+ gr.Markdown("""
137
+ # CountEX: Discriminative Visual Counting
138
+
139
+ Count specific objects in images using positive and negative text prompts.
140
+
141
+ **Positive Prompt**: Describe what you want to count (e.g., "Green Apple")
142
+
143
+ **Negative Prompt**: Describe what you want to exclude (e.g., "Red Apple")
144
+ """)
145
+
146
+ with gr.Row():
147
+ with gr.Column(scale=1):
148
+ input_image = gr.Image(type="pil", label="Input Image")
149
+
150
+ pos_caption = gr.Textbox(
151
+ label="Positive Prompt",
152
+ placeholder="e.g., Green Apple",
153
+ value="Green Apple"
154
+ )
155
+
156
+ neg_caption = gr.Textbox(
157
+ label="Negative Prompt (optional)",
158
+ placeholder="e.g., Red Apple",
159
+ value="Red Apple"
160
+ )
161
+
162
+ with gr.Accordion("Advanced Settings", open=False):
163
+ box_threshold = gr.Slider(
164
+ minimum=0.0,
165
+ maximum=1.0,
166
+ value=0.0,
167
+ step=0.01,
168
+ label="Detection Threshold (0 = use model default)"
169
+ )
170
+
171
+ point_radius = gr.Slider(
172
+ minimum=1,
173
+ maximum=20,
174
+ value=5,
175
+ step=1,
176
+ label="Point Radius"
177
+ )
178
+
179
+ point_color = gr.Dropdown(
180
+ choices=["blue", "red", "green", "yellow", "cyan", "magenta", "white"],
181
+ value="blue",
182
+ label="Point Color"
183
+ )
184
+
185
+ submit_btn = gr.Button("Count Objects", variant="primary")
186
+
187
+ with gr.Column(scale=1):
188
+ output_image = gr.Image(type="pil", label="Result")
189
+ count_output = gr.Textbox(label="Count Result")
190
+
191
+ # Example images
192
+ # gr.Examples(
193
+ # examples=[
194
+ # ["examples/apples.jpg", "Green Apple", "Red Apple"],
195
+ # ["examples/cars.jpg", "Red Car", "Blue Car"],
196
+ # ["examples/people.jpg", "Person wearing hat", "Person without hat"],
197
+ # ],
198
+ # inputs=[input_image, pos_caption, neg_caption],
199
+ # outputs=[output_image, count_output],
200
+ # fn=count_objects,
201
+ # cache_examples=False,
202
+ # )
203
+
204
+ submit_btn.click(
205
+ fn=count_objects,
206
+ inputs=[input_image, pos_caption, neg_caption, box_threshold, point_radius, point_color],
207
+ outputs=[output_image, count_output]
208
+ )
209
+
210
+ return demo
211
+
212
+
213
+ if __name__ == "__main__":
214
+ # Load model at startup
215
+ print("Loading model...")
216
+ load_model()
217
+ print("Model loaded!")
218
+
219
+ # Create and launch demo
220
+ demo = create_demo()
221
+ demo.launch()
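
For quick reference, the sketch below mirrors the demo's count_objects() flow as a standalone script, without the Gradio UI. It is illustrative only and not part of this commit: it assumes this repo's hf_model package and utils.py are importable, and "apples.jpg" is a hypothetical local image path.

import torch
from PIL import Image
from transformers import GroundingDinoProcessor
from hf_model import CountEX
from utils import post_process_grounded_object_detection

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CountEX.from_pretrained("BBVisual/CountEX-KC").to(torch.bfloat16).to(device).eval()
processor = GroundingDinoProcessor.from_pretrained("fushh7/llmdet_swin_tiny_hf")

image = Image.open("apples.jpg").convert("RGB")  # hypothetical local image
pos = processor(images=image, text="Green Apple.", return_tensors="pt").to(device)
neg = processor(images=image, text="Red Apple.", return_tensors="pt").to(device)
pos["pixel_values"] = pos["pixel_values"].to(torch.bfloat16)
neg["pixel_values"] = neg["pixel_values"].to(torch.bfloat16)

# attach the negative branch exactly as the demo does
pos["neg_input_ids"] = neg["input_ids"]
pos["neg_token_type_ids"] = neg["token_type_ids"]
pos["neg_attention_mask"] = neg["attention_mask"]
pos["neg_pixel_mask"] = neg["pixel_mask"]
pos["neg_pixel_values"] = neg["pixel_values"]
pos["use_neg"] = True

with torch.no_grad():
    outputs = model(**pos)

# the post-processor reads pred_points / pred_logits
outputs["pred_points"] = outputs["pred_boxes"][:, :, :2]
outputs["pred_logits"] = outputs["logits"]
results = post_process_grounded_object_detection(outputs, box_threshold=model.box_threshold)[0]
print("Count:", len(results["boxes"]))
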
hf_model/CountEX.py ADDED
@@ -0,0 +1,543 @@
1
+ # coding=utf-8
2
+ """
3
+ Negative Grounding DINO Model for Object Detection with Negative Caption Support.
4
+
5
+ This module extends the original GroundingDinoForObjectDetection to support negative captions
6
+ for improved object detection performance.
7
+ """
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from typing import Dict, List, Optional, Tuple, Union
12
+ from transformers.modeling_outputs import ModelOutput
13
+ import torch.nn.functional as F
14
+ from .modeling_grounding_dino import (
15
+ GroundingDinoForObjectDetection,
16
+ GroundingDinoObjectDetectionOutput,
17
+ GroundingDinoEncoderOutput,
18
+ )
19
+
20
+
21
+ # density_fpn_head.py
22
+ import torch
23
+ import torch.nn as nn
24
+ import torch.nn.functional as F
25
+
26
+
27
+ def _bilinear(x, size):
28
+ return F.interpolate(x, size=size, mode="bilinear", align_corners=False)
29
+
30
+
31
+ class DensityFPNHead(nn.Module):
32
+ def __init__(self,
33
+ in_channels: int = 512,
34
+ mid_channels: int = 128,
35
+ act_layer=nn.ReLU,
36
+ norm_layer=nn.BatchNorm2d):
37
+ super().__init__()
38
+
39
+ # ---- 1×1 lateral convs (P3–P6) ----
40
+ self.lateral = nn.ModuleList([
41
+ nn.Conv2d(in_channels, mid_channels, 1) for _ in range(4)
42
+ ])
43
+
44
+ # ---- smooth convs after add ----
45
+ self.smooth = nn.ModuleList([
46
+ nn.Sequential(
47
+ nn.Conv2d(mid_channels, mid_channels, 3, padding=1, bias=False),
48
+ norm_layer(mid_channels),
49
+ act_layer(inplace=True),
50
+ ) for _ in range(3) # P6→P5, P5→P4, P4→P3
51
+ ])
52
+
53
+ self.up_blocks = nn.ModuleList([
54
+ nn.Sequential(
55
+ act_layer(inplace=True),
56
+ nn.Conv2d(mid_channels, mid_channels, 3, padding=1, bias=False),
57
+ norm_layer(mid_channels),
58
+ act_layer(inplace=True),
59
+ ) for _ in range(3) # 167×94 → … → 1336×752
60
+ ])
61
+
62
+ # ---- output 3×3 conv -> 1 ----
63
+ self.out_conv = nn.Conv2d(mid_channels, 1, 3, padding=1, bias=False)
64
+
65
+ def forward(self, feats):
66
+ assert len(feats) == 4, "Expect feats list = [P3,P4,P5,P6]"
67
+
68
+ # lateral 1×1
69
+ lat = [l(f) for l, f in zip(self.lateral, feats)]
70
+
71
+ # top-down FPN fusion
72
+ x = lat[-1] # P6
73
+ for i in range(3)[::-1]: # P5,P4,P3
74
+ x = _bilinear(x, lat[i].shape[-2:])
75
+ x = x + lat[i]
76
+ x = self.smooth[i](x)
77
+
78
+ # three-stage upsample + conv
79
+ for up in self.up_blocks:
80
+ h, w = x.shape[-2], x.shape[-1]
81
+ x = _bilinear(x, (h * 2, w * 2))
82
+ x = up(x)
83
+
84
+ x = self.out_conv(x)
85
+ return F.relu(x)
86
+
87
+
88
+ import torch
89
+ import torch.nn as nn
90
+ import torch.nn.functional as F
91
+
92
+ def l2norm(x, dim=-1, eps=1e-6):
93
+ return x / (x.norm(dim=dim, keepdim=True) + eps)
94
+
95
+ # -----------------------------------
96
+ # 1) CommonFinderSimple
97
+ # learn r "common prototypes" that represent what the positive and negative queries share
98
+ # nothing fancy: just MHA pooling + two light regularizations (shareability + diversity)
99
+ # -----------------------------------
100
+ class CommonFinderSimple(nn.Module):
101
+ """
102
+ Inputs:
103
+ Q_pos: [B, K, D]
104
+ Q_neg: [B, K, D]
105
+ Returns:
106
+ C_rows: [B, r, D] # r common prototypes copied across the batch (unit-normalized)
107
+ loss: scalar # small regularization: shareability + diversity
108
+ stats: dict
109
+ """
110
+ def __init__(self, d_model=256, r=64, nhead=4,
111
+ share_w=0.02, div_w=0.02, ln_after=False):
112
+ super().__init__()
113
+ self.r = r
114
+ self.share_w = share_w
115
+ self.div_w = div_w
116
+
117
+ proto = torch.randn(r, d_model)
118
+ self.proto = nn.Parameter(l2norm(proto, -1)) # r×D learnable "core queries"
119
+ self.attn = nn.MultiheadAttention(d_model, nhead, batch_first=True)
120
+ self.post = nn.Linear(d_model, d_model)
121
+ self.ln = nn.LayerNorm(d_model) if ln_after else nn.Identity()
122
+
123
+ def forward(self, Q_pos: torch.Tensor, Q_neg: torch.Tensor):
124
+ B, K, D = Q_pos.shape
125
+ seeds = self.proto[None].expand(B, -1, -1).contiguous() # [B,r,D]
126
+ X = torch.cat([Q_pos, Q_neg], dim=1) # [B,2K,D]
127
+
128
+ # use seeds to do one attention pooling on positive and negative sets, get r "common prototypes"
129
+ C, _ = self.attn(query=seeds, key=X, value=X) # [B,r,D]
130
+ C = l2norm(self.ln(self.post(C)), -1) # unit-normalize
131
+
132
+ # ---- Simple regularization: encourage C to be close to both Q_pos and Q_neg, and diverse from each other ----
133
+ # Shareability: average of maximum cosine similarity between C and Q_pos/Q_neg
134
+ cos_pos = torch.einsum('brd,bkd->brk', C, l2norm(Q_pos, -1)) # [B,r,K]
135
+ cos_neg = torch.einsum('brd,bkd->brk', C, l2norm(Q_neg, -1))
136
+ share_term = -(cos_pos.amax(dim=-1).mean() + cos_neg.amax(dim=-1).mean())
137
+
138
+ # Diversity: cosine between C should not collapse
139
+ C0 = l2norm(self.proto, -1) # [r,D]
140
+ gram = C0 @ C0.t() # [r,r]
141
+ div_term = (gram - torch.eye(self.r, device=gram.device)).pow(2).mean()
142
+
143
+ loss = self.share_w * share_term + self.div_w * div_term
144
+ stats = {
145
+ 'share_term': share_term.detach(),
146
+ 'div_term': div_term.detach(),
147
+ 'mean_cos_pos': cos_pos.mean().detach(),
148
+ 'mean_cos_neg': cos_neg.mean().detach()
149
+ }
150
+ return C, loss, stats
151
+
152
+
153
+ # -----------------------------------
154
+ # 2) NegExclusiveSimple
155
+ # Remove "common" information from negative queries: two simple strategies can be used independently or together
156
+ # (A) Soft removal: subtract the projection onto C (residual keeps non-common)
157
+ # (B) Filtering: only keep the Top-M negative samples least similar to C
158
+ # -----------------------------------
159
+ class NegExclusiveSimple(nn.Module):
160
+ """
161
+ Inputs:
162
+ Q_neg: [B,K,D]
163
+ C_rows: [B,r,D] # common prototypes
164
+ Args:
165
+ mode: 'residual' | 'filter' | 'both'
166
+ M: Top-M for 'filter'
167
+ thresh: Filter threshold (max_cos_neg < thresh to keep), None means only use Top-M
168
+ Returns:
169
+ neg_refs: [B, M_or_K, D] # as negative reference (for next fusion)
170
+ aux: dict
171
+ """
172
+ def __init__(self, mode='residual', M=16, thresh=None):
173
+ super().__init__()
174
+ assert mode in ('residual', 'filter', 'both')
175
+ self.mode = mode
176
+ self.M = M
177
+ self.thresh = thresh
178
+
179
+ def forward(self, Q_neg: torch.Tensor, C_rows: torch.Tensor):
180
+ B, K, D = Q_neg.shape
181
+ r = C_rows.size(1)
182
+ Qn = l2norm(Q_neg, -1)
183
+ C = l2norm(C_rows, -1)
184
+
185
+ sim = torch.einsum('bkd,brd->bkr', Qn, C).amax(dim=-1) # [B,K]
186
+
187
+ outputs = {}
188
+ if self.mode in ('residual', 'both'):
189
+ # proj = (Q · C^T) C -> [B,K,D]; first weight [B,K,r], then multiply C [B,r,D]
190
+ w = torch.einsum('bkd,brd->bkr', Qn, C) # [B,K,r]
191
+ proj = torch.einsum('bkr,brd->bkd', w, C) # [B,K,D]
192
+ neg_resid = l2norm(Qn - proj, -1) # non-common residual
193
+ outputs['residual'] = neg_resid
194
+
195
+ if self.mode in ('filter', 'both'):
196
+ excl_score = 1.0 - sim # large = away from common
197
+ if self.thresh is not None:
198
+ mask = (sim < self.thresh).float()
199
+ excl_score = excl_score * mask + (-1e4) * (1 - mask)
200
+ M = min(self.M, K)
201
+ topv, topi = torch.topk(excl_score, k=M, dim=1) # [B,M]
202
+ neg_top = torch.gather(Qn, 1, topi.unsqueeze(-1).expand(-1, -1, D))
203
+ outputs['filtered'] = neg_top
204
+
205
+ if self.mode == 'residual':
206
+ neg_refs = outputs['residual']
207
+ elif self.mode == 'filter':
208
+ neg_refs = outputs['filtered']
209
+ else:
210
+ R = outputs['residual'] # [B,K,D]
211
+ excl_score = 1.0 - sim
212
+ M = min(self.M, K)
213
+ topv, topi = torch.topk(excl_score, k=M, dim=1)
214
+ neg_refs = torch.gather(R, 1, topi.unsqueeze(-1).expand(-1, -1, D)) # [B,M,D]
215
+
216
+ aux = {
217
+ 'mean_sim_to_common': sim.mean().detach(),
218
+ 'kept_M': neg_refs.size(1)
219
+ }
220
+ return neg_refs, aux
221
+
222
+ import torch
223
+ import torch.nn as nn
224
+ import torch.nn.functional as F
225
+
226
+ def l2norm(x, dim=-1, eps=1e-6):
227
+ return x / (x.norm(dim=dim, keepdim=True) + eps)
228
+
229
+ class FusionNoGate(nn.Module):
230
+ """
231
+ Direct fusion (no gating): fuse neg_ref into Q_pos via one cross-attn.
232
+ Variants:
233
+ - 'residual_sub': Q_new = Q_pos - scale * LN(Z)
234
+ - 'residual_add': Q_new = Q_pos + scale * LN(Z)
235
+ - 'concat_linear': Q_new = Q_pos + Linear([Q_pos; Z])
236
+ """
237
+ def __init__(self, d_model=256, nhead=4, fusion_mode='residual_sub',
238
+ init_scale=0.2, dropout_p=0.0):
239
+ super().__init__()
240
+ assert fusion_mode in ('residual_sub', 'residual_add', 'concat_linear')
241
+ self.fusion_mode = fusion_mode
242
+ self.attn = nn.MultiheadAttention(d_model, nhead, batch_first=True)
243
+ self.ln_z = nn.LayerNorm(d_model)
244
+ self.drop = nn.Dropout(dropout_p) if dropout_p > 0 else nn.Identity()
245
+ self.scale = nn.Parameter(torch.tensor(float(init_scale)))
246
+ if fusion_mode == 'concat_linear':
247
+ self.mix = nn.Linear(2 * d_model, d_model)
248
+ nn.init.zeros_(self.mix.weight)
249
+ nn.init.zeros_(self.mix.bias)
250
+
251
+ def forward(self, Q_pos: torch.Tensor, neg_ref: torch.Tensor):
252
+ """
253
+ Q_pos: [B, K, D]
254
+ neg_ref: [B, M, D]
255
+ return: Q_new [B, K, D], stats dict
256
+ """
257
+ B, K, D = Q_pos.shape
258
+ M = neg_ref.size(1)
259
+ if M == 0:
260
+ return Q_pos, {'kept': 0, 'scale': self.scale.detach()}
261
+
262
+ # 1) Cross-attention:
263
+ Z, attn_w = self.attn(query=Q_pos, key=neg_ref, value=neg_ref) # Z:[B,K,D]
264
+ Z = self.ln_z(Z)
265
+ Z = self.drop(Z)
266
+
267
+ # 2) fuse without gating
268
+ if self.fusion_mode == 'residual_sub':
269
+ Q_new = Q_pos - self.scale * Z
270
+ # print("z: ", Z.sum())
271
+ # print(torch.abs(Q_new - Q_pos).sum())
272
+ elif self.fusion_mode == 'residual_add':
273
+ Q_new = Q_pos + self.scale * Z
274
+ else: # 'concat_linear'
275
+ fused = torch.cat([Q_pos, Z], dim=-1) # [B,K,2D]
276
+ delta = self.mix(fused) # [B,K,D]
277
+ Q_new = Q_pos + delta
278
+
279
+ stats = {
280
+ 'kept': M,
281
+ 'attn_mean': attn_w.mean().detach(),
282
+ 'fusion_scale': self.scale.detach()
283
+ }
284
+ return Q_new, stats
285
+
286
+ class QuerySideNegNaive(nn.Module):
287
+ def __init__(self, d_model=256, r=64, M=64, nhead=4,
288
+ excl_mode='both', excl_thresh=0.5, gamma_max=0.7,
289
+ share_w=0.02, div_w=0.02):
290
+ super().__init__()
291
+ self.common = CommonFinderSimple(d_model, r, nhead, share_w, div_w)
292
+ self.excl = NegExclusiveSimple(mode=excl_mode, M=M, thresh=excl_thresh)
293
+ self.fuse = FusionNoGate(d_model=d_model,
294
+ nhead=4,
295
+ fusion_mode='residual_sub', # or 'concat_linear'
296
+ init_scale=0.25,
297
+ dropout_p=0.1)
298
+
299
+ def forward(self, Q_pos: torch.Tensor, Q_neg: torch.Tensor):
300
+ C_rows, l_common, common_stats = self.common(Q_pos, Q_neg)
301
+ neg_refs, excl_stats = self.excl(Q_neg, C_rows)
302
+ Q_new, fuse_stats = self.fuse(Q_pos, neg_refs)
303
+ loss = l_common
304
+ stats = {}
305
+ stats.update(common_stats); stats.update(excl_stats); stats.update(fuse_stats)
306
+ return Q_new, loss, stats
307
+
308
+ def set_fusion_scale(self, scale: float):
309
+ del self.fuse.scale
310
+ self.fuse.scale = nn.Parameter(torch.tensor(scale))
311
+
312
+
313
+ class CountEX(GroundingDinoForObjectDetection):
314
+ """
315
+ Grounding DINO Model with negative caption support for improved object detection.
316
+
317
+ This model extends the original GroundingDinoForObjectDetection by adding
318
+ support for negative captions, which helps improve detection accuracy by
319
+ learning what NOT to detect.
320
+ """
321
+
322
+ def __init__(self, config):
323
+ super().__init__(config)
324
+
325
+ # Initialize negative fusion modules directly in __init__
326
+ self.query_side_neg_pipeline = QuerySideNegNaive()
327
+ self.density_head = DensityFPNHead()
328
+ self.config = config
329
+ self.box_threshold = getattr(config, 'box_threshold', 0.4)
330
+
331
+ def forward(
332
+ self,
333
+ pixel_values: torch.FloatTensor,
334
+ input_ids: torch.LongTensor,
335
+ token_type_ids: torch.LongTensor = None,
336
+ attention_mask: torch.LongTensor = None,
337
+ pixel_mask: Optional[torch.BoolTensor] = None,
338
+ encoder_outputs: Optional[Union[GroundingDinoEncoderOutput, Tuple]] = None,
339
+ output_attentions: Optional[bool] = None,
340
+ output_hidden_states: Optional[bool] = None,
341
+ return_dict: Optional[bool] = None,
342
+ labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None,
343
+ # Negative prompt parameters
344
+ neg_pixel_values: Optional[torch.FloatTensor] = None,
345
+ neg_input_ids: Optional[torch.LongTensor] = None,
346
+ neg_token_type_ids: Optional[torch.LongTensor] = None,
347
+ neg_attention_mask: Optional[torch.LongTensor] = None,
348
+ neg_pixel_mask: Optional[torch.BoolTensor] = None,
349
+ **kwargs,
350
+ ):
351
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
352
+ use_neg = kwargs.get('use_neg', True)
353
+ # Get positive outputs
354
+ pos_kwargs = {
355
+ 'exemplars': kwargs.get('pos_exemplars', None),
356
+ }
357
+ outputs = self.model(
358
+ pixel_values=pixel_values,
359
+ input_ids=input_ids,
360
+ token_type_ids=token_type_ids,
361
+ attention_mask=attention_mask,
362
+ pixel_mask=pixel_mask,
363
+ encoder_outputs=encoder_outputs,
364
+ output_attentions=output_attentions,
365
+ output_hidden_states=output_hidden_states,
366
+ return_dict=return_dict,
367
+ **pos_kwargs,
368
+ )
369
+
370
+ spatial_shapes = outputs.spatial_shapes
371
+ token_num = 0
372
+ token_num_list = [0]
373
+ for i in range(len(spatial_shapes)):
374
+ token_num += spatial_shapes[i][0] * spatial_shapes[i][1]
375
+ token_num_list.append(token_num.item())
376
+
377
+ positive_feature_maps = []
378
+ encoder_last_hidden_state_vision = outputs.encoder_last_hidden_state_vision
379
+ for i in range(len(spatial_shapes)):
380
+ feature_map = encoder_last_hidden_state_vision[:, token_num_list[i]:token_num_list[i+1], :]
381
+ spatial_shape = spatial_shapes[i]
382
+ b, t, d = feature_map.shape
383
+ feature_map = feature_map.reshape(b, spatial_shape[0], spatial_shape[1], d)
384
+ positive_feature_maps.append(feature_map)
385
+
386
+ # Get negative outputs
387
+ neg_kwargs = {
388
+ 'exemplars': kwargs.get('neg_exemplars', None),
389
+ }
390
+ # print(kwargs)
391
+ neg_outputs = self.model(
392
+ pixel_values=neg_pixel_values,
393
+ input_ids=neg_input_ids,
394
+ token_type_ids=neg_token_type_ids,
395
+ attention_mask=neg_attention_mask,
396
+ pixel_mask=neg_pixel_mask,
397
+ encoder_outputs=encoder_outputs,
398
+ output_attentions=output_attentions,
399
+ output_hidden_states=output_hidden_states,
400
+ return_dict=return_dict,
401
+ **neg_kwargs,
402
+ )
403
+
404
+ neg_encoder_last_hidden_state_vision = neg_outputs.encoder_last_hidden_state_vision
405
+ neg_positive_feature_maps = []
406
+ for i in range(len(spatial_shapes)):
407
+ feature_map = neg_encoder_last_hidden_state_vision[:, token_num_list[i]:token_num_list[i+1], :]
408
+ spatial_shape = spatial_shapes[i]
409
+ b, t, d = feature_map.shape
410
+ feature_map = feature_map.reshape(b, spatial_shape[0], spatial_shape[1], d)
411
+ neg_positive_feature_maps.append(feature_map)
412
+
413
+ if return_dict:
414
+ hidden_states = outputs.intermediate_hidden_states
415
+ neg_hidden_states = neg_outputs.intermediate_hidden_states
416
+ else:
417
+ hidden_states = outputs[2]
418
+ neg_hidden_states = neg_outputs[2]
419
+
420
+ idx = 5 + (1 if output_attentions else 0) + (1 if output_hidden_states else 0)
421
+ enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[idx]
422
+ hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2]
423
+ init_reference_points = outputs.init_reference_points if return_dict else outputs[1]
424
+ inter_references_points = outputs.intermediate_reference_points if return_dict else outputs[3]
425
+
426
+ # drop the exemplar tokens if used
427
+ pos_exemplars = pos_kwargs.get('exemplars', None)
428
+ neg_exemplars = neg_kwargs.get('exemplars', None)
429
+ if pos_exemplars is not None or neg_exemplars is not None or attention_mask.shape[1] != enc_text_hidden_state.shape[1]:
430
+ enc_text_hidden_state = enc_text_hidden_state[:, :enc_text_hidden_state.shape[1] - 3, :]
431
+
432
+ # class logits + predicted bounding boxes
433
+ outputs_classes = []
434
+ outputs_coords = []
435
+
436
+ # Apply negative fusion
437
+ if use_neg:
438
+ # print("Using negative fusions")
439
+ #neg_hidden_states = self.negative_semantic_extractor(neg_hidden_states)
440
+ #hidden_states = self.negative_fusion_module(hidden_states, neg_hidden_states)
441
+ hidden_states = hidden_states.squeeze(0)
442
+ neg_hidden_states = neg_hidden_states.squeeze(0)
443
+ hidden_states, extra_loss, logs = self.query_side_neg_pipeline(hidden_states, neg_hidden_states)
444
+ hidden_states = hidden_states.unsqueeze(0)
445
+ neg_hidden_states = neg_hidden_states.unsqueeze(0)
446
+ # print("extra_loss: ", extra_loss)
447
+ else:
448
+ # print("Not using negative fusions")
449
+ extra_loss = None
450
+ logs = None
451
+ # print("Not using negative fusion")
452
+ # print("extra_loss: ", extra_loss)
453
+
454
+ # predict class and bounding box deltas for each stage
455
+ num_levels = hidden_states.shape[1]
456
+ for level in range(num_levels):
457
+ if level == 0:
458
+ reference = init_reference_points
459
+ else:
460
+ reference = inter_references_points[:, level - 1]
461
+ reference = torch.special.logit(reference, eps=1e-5)
462
+
463
+ # print("hidden_states[:, level]: ", hidden_states[:, level].shape)
464
+ # print("enc_text_hidden_state: ", enc_text_hidden_state.shape)
465
+ # print("attention_mask: ", attention_mask.shape)
466
+
467
+ assert attention_mask.shape[1] == enc_text_hidden_state.shape[1], "Attention mask and text hidden state have different lengths: {} != {}".format(attention_mask.shape[1], enc_text_hidden_state.shape[1])
468
+ outputs_class = self.class_embed[level](
469
+ vision_hidden_state=hidden_states[:, level],
470
+ text_hidden_state=enc_text_hidden_state,
471
+ text_token_mask=attention_mask.bool(),
472
+ )
473
+ delta_bbox = self.bbox_embed[level](hidden_states[:, level])
474
+
475
+ reference_coordinates = reference.shape[-1]
476
+ if reference_coordinates == 4:
477
+ outputs_coord_logits = delta_bbox + reference
478
+ elif reference_coordinates == 2:
479
+ delta_bbox[..., :2] += reference
480
+ outputs_coord_logits = delta_bbox
481
+ else:
482
+ raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}")
483
+ outputs_coord = outputs_coord_logits.sigmoid()
484
+ outputs_classes.append(outputs_class)
485
+ outputs_coords.append(outputs_coord)
486
+ outputs_class = torch.stack(outputs_classes)
487
+ outputs_coord = torch.stack(outputs_coords)
488
+
489
+ logits = outputs_class[-1]
490
+ pred_boxes = outputs_coord[-1]
491
+
492
+ loss, loss_dict, auxiliary_outputs = None, None, None
493
+ if not return_dict:
494
+ if auxiliary_outputs is not None:
495
+ output = (logits, pred_boxes) + auxiliary_outputs + outputs
496
+ else:
497
+ output = (logits, pred_boxes) + outputs
498
+ tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output
499
+
500
+ return tuple_outputs
501
+
502
+ all_feats = []
503
+ for pf, npf in zip(positive_feature_maps, neg_positive_feature_maps):
504
+ pf = pf.permute(0, 3, 1, 2)
505
+ npf = npf.permute(0, 3, 1, 2)
506
+ all_feats.append(torch.cat([pf, npf], dim=1))
507
+
508
+
509
+ # pos_feat = positive_feature_maps[0].permute(0, 3, 1, 2)
510
+ # neg_feat = neg_positive_feature_maps[0].permute(0, 3, 1, 2)
511
+ # pos_minus_neg_feat = F.relu(pos_feat - neg_feat)
512
+ # density_feat_map = torch.cat([pos_feat, neg_feat, pos_minus_neg_feat], dim=1)
513
+ # density_feat_map = torch.cat([pos_feat, neg_feat], dim=1)
514
+ density_map_pred = self.density_head(all_feats)
515
+
516
+ dict_outputs = GroundingDinoObjectDetectionOutput(
517
+ loss=loss,
518
+ loss_dict=loss_dict,
519
+ logits=logits,
520
+ pred_boxes=pred_boxes,
521
+ last_hidden_state=outputs.last_hidden_state,
522
+ auxiliary_outputs=auxiliary_outputs,
523
+ decoder_hidden_states=outputs.decoder_hidden_states,
524
+ decoder_attentions=outputs.decoder_attentions,
525
+ encoder_last_hidden_state_vision=outputs.encoder_last_hidden_state_vision,
526
+ encoder_last_hidden_state_text=outputs.encoder_last_hidden_state_text,
527
+ encoder_vision_hidden_states=outputs.encoder_vision_hidden_states,
528
+ encoder_text_hidden_states=outputs.encoder_text_hidden_states,
529
+ encoder_attentions=outputs.encoder_attentions,
530
+ intermediate_hidden_states=outputs.intermediate_hidden_states,
531
+ intermediate_reference_points=outputs.intermediate_reference_points,
532
+ init_reference_points=outputs.init_reference_points,
533
+ enc_outputs_class=outputs.enc_outputs_class,
534
+ enc_outputs_coord_logits=outputs.enc_outputs_coord_logits,
535
+ spatial_shapes=outputs.spatial_shapes,
536
+ positive_feature_maps=positive_feature_maps,
537
+ negative_feature_maps=neg_positive_feature_maps,
538
+ density_map_pred=density_map_pred,
539
+ extra_loss=extra_loss,
540
+ extra_logs=logs,
541
+ )
542
+
543
+ return dict_outputs
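
The query-side pipeline above (CommonFinderSimple → NegExclusiveSimple → FusionNoGate) only re-weights query embeddings, so it can be sanity-checked in isolation. Below is a hypothetical shape check, not part of this commit, using random tensors and the default hyperparameters (d_model=256, r=64, M=64); it assumes the hf_model package is importable.

import torch
from hf_model.CountEX import QuerySideNegNaive

pipeline = QuerySideNegNaive().eval()
Q_pos = torch.randn(1, 900, 256)  # [B, K, D] positive decoder queries
Q_neg = torch.randn(1, 900, 256)  # [B, K, D] negative decoder queries

with torch.no_grad():
    Q_new, reg_loss, stats = pipeline(Q_pos, Q_neg)

print(Q_new.shape)        # torch.Size([1, 900, 256]) -- query shape is unchanged
print(float(reg_loss))    # shareability + diversity regularizer (used at train time)
print(stats["kept_M"])    # number of negative references kept, min(M, K) = 64
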
hf_model/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # coding=utf-8
2
+ """
3
+ HF Model package for Grounding DINO with negative caption support.
4
+ """
5
+
6
+ from .modeling_grounding_dino import (
7
+ GroundingDinoForObjectDetection,
8
+ )
9
+
10
+ from .CountEX import (
11
+ CountEX
12
+ )
13
+
14
+ __all__ = [
15
+ "CountEX",
16
+ ]
hf_model/mmdet2groundingdino_swinb.py ADDED
@@ -0,0 +1,259 @@
1
+ # mmdet to groundingdino
2
+ import argparse
3
+ from collections import OrderedDict
4
+ import torch
5
+ from mmengine.runner import CheckpointLoader
6
+
7
+ # convert the functions from mmdet to groundingdino
8
+ def correct_unfold_reduction_order(x):
9
+ out_channel, in_channel = x.shape
10
+ x = x.reshape(out_channel, in_channel // 4, 4).transpose(1, 2)
11
+ x = x[:, [0, 2, 1, 3], :]
12
+ x = x.reshape(out_channel, in_channel)
13
+ return x
14
+
15
+ def correct_unfold_norm_order(x):
16
+ in_channel = x.shape[0]
17
+ x = x.reshape(in_channel // 4, 4).transpose(0, 1)
18
+ x = x[[0, 2, 1, 3], :]
19
+ x = x.reshape(in_channel)
20
+ return x
21
+
22
+ def convert(ckpt):
23
+ """Inverse mapping of checkpoint parameters to their original names."""
24
+ # Create a dictionary to hold the reversed checkpoint
25
+ new_ckpt = OrderedDict()
26
+
27
+ for k, v in list(ckpt.items()):
28
+ new_v = v # Start with the original value
29
+
30
+ # Inverse rules based on the convert function (from specific to general)
31
+ if k.startswith('decoder'):
32
+ new_k = k.replace('decoder', 'transformer.decoder')
33
+ if 'norms.2' in new_k:
34
+ new_k = new_k.replace('norms.2', 'norm1')
35
+ if 'norms.1' in new_k:
36
+ new_k = new_k.replace('norms.1', 'catext_norm')
37
+ if 'norms.0' in new_k:
38
+ new_k = new_k.replace('norms.0', 'norm2')
39
+ if 'norms.3' in new_k:
40
+ new_k = new_k.replace('norms.3', 'norm3')
41
+ if 'cross_attn_text' in new_k:
42
+ new_k = new_k.replace('cross_attn_text', 'ca_text')
43
+ new_k = new_k.replace('attn.in_proj_weight', 'in_proj_weight')
44
+ new_k = new_k.replace('attn.in_proj_bias', 'in_proj_bias')
45
+ new_k = new_k.replace('attn.out_proj.weight', 'out_proj.weight')
46
+ new_k = new_k.replace('attn.out_proj.bias', 'out_proj.bias')
47
+ if 'ffn.layers.0.0' in new_k:
48
+ new_k = new_k.replace('ffn.layers.0.0', 'linear1')
49
+ if 'ffn.layers.1' in new_k:
50
+ new_k = new_k.replace('ffn.layers.1', 'linear2')
51
+ if 'self_attn.attn' in new_k:
52
+ new_k = new_k.replace('self_attn.attn', 'self_attn')
53
+
54
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
55
+
56
+ #########################################################################
57
+
58
+ # the last reg_layer_id of the encoder part is 6, which distinguishes it from the decoder
59
+ elif k.startswith('bbox_head.reg_branches.6'):
60
+ if k.startswith('bbox_head.reg_branches.6.0'):
61
+ new_k = k.replace('bbox_head.reg_branches.6.0',
62
+ 'transformer.enc_out_bbox_embed.layers.0')
63
+ if k.startswith('bbox_head.reg_branches.6.2'):
64
+ new_k = k.replace('bbox_head.reg_branches.6.2',
65
+ 'transformer.enc_out_bbox_embed.layers.1')
66
+ if k.startswith('bbox_head.reg_branches.6.4'):
67
+ new_k = k.replace('bbox_head.reg_branches.6.4',
68
+ 'transformer.enc_out_bbox_embed.layers.2')
69
+
70
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
71
+
72
+ #########################################################################
73
+
74
+ elif k.startswith('query_embedding'):
75
+ new_k = k.replace('query_embedding', 'transformer.tgt_embed')
76
+
77
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
78
+
79
+ #########################################################################
80
+
81
+ elif k.startswith('bbox_head.reg_branches'):
82
+ # mmdet omits part of the parameter name; check the groundingdino checkpoint
83
+ # groundingdino has two groups of parameters with identical values,
84
+ # namely bbox_embed and transformer.decoder.embed,
85
+ # so mmdet simply "merged" the two groups of parameters
86
+ reg_layer_id = int(k.split('.')[2])
87
+ linear_id = int(k.split('.')[3])
88
+ weight_or_bias = k.split('.')[-1]
89
+ new_k1 = 'transformer.decoder.bbox_embed.' + \
90
+ str(reg_layer_id) + '.layers.' + str(linear_id // 2) + '.' + weight_or_bias
91
+ new_k2 = 'bbox_embed.' + \
92
+ str(reg_layer_id) + '.layers.' + str(linear_id // 2) + '.' + weight_or_bias
93
+
94
+ new_ckpt[new_k1] = new_v # Add the key and value to the original checkpoint dict
95
+ new_ckpt[new_k2] = new_v # Add the key and value to the original checkpoint dict
96
+
97
+ #########################################################################
98
+
99
+ elif k.startswith('bbox_head.cls_branches.6'):
100
+ # mmdet adds a bias term in contrastive_embed
101
+ # but the decoder uses indices 0~5, so index 6 corresponds to enc_out.class_embed after two-stage fine-tuning
102
+ new_k = 'transformer.enc_out_class_embed.bias'
103
+
104
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
105
+
106
+ #########################################################################
107
+
108
+ elif k.startswith('bbox_head.cls_branches'):
109
+ # mmdet adds a bias term in contrastive_embed
110
+ new_k1 = 'transformer.decoder.class_embed.' + k[-6:]
111
+ new_k2 = 'class_embed.' + k[-6:]
112
+
113
+ new_ckpt[new_k1] = new_v # Add the key and value to the original checkpoint dict
114
+ new_ckpt[new_k2] = new_v # Add the key and value to the original checkpoint dict
115
+
116
+ #########################################################################
117
+
118
+ elif k.startswith('memory_trans_'):
119
+ if k.startswith('memory_trans_fc'):
120
+ new_k = k.replace('memory_trans_fc', 'transformer.enc_output')
121
+ elif k.startswith('memory_trans_norm'):
122
+ new_k = k.replace('memory_trans_norm', 'transformer.enc_output_norm')
123
+
124
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
125
+
126
+ #########################################################################
127
+
128
+ elif k.startswith('encoder'):
129
+ new_k = k.replace('encoder', 'transformer.encoder')
130
+ new_k = new_k.replace('norms.0', 'norm1')
131
+ new_k = new_k.replace('norms.1', 'norm2')
132
+ new_k = new_k.replace('norms.2', 'norm3')
133
+ new_k = new_k.replace('ffn.layers.0.0', 'linear1')
134
+ new_k = new_k.replace('ffn.layers.1', 'linear2')
135
+ if 'text_layers' in new_k:
136
+ new_k = new_k.replace('self_attn.attn', 'self_attn')
137
+
138
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
139
+
140
+ #########################################################################
141
+
142
+ elif k.startswith('level_embed'):
143
+ new_k = k.replace('level_embed', 'transformer.level_embed')
144
+
145
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
146
+
147
+ #########################################################################
148
+
149
+ elif k.startswith('neck.convs'):
150
+ new_k = k.replace('neck.convs', 'input_proj')
151
+ new_k = new_k.replace('neck.extra_convs.0', 'neck.convs.3')
152
+ new_k = new_k.replace('conv.weight', '0.weight')
153
+ new_k = new_k.replace('conv.bias', '0.bias')
154
+ new_k = new_k.replace('gn.weight', '1.weight')
155
+ new_k = new_k.replace('gn.bias', '1.bias')
156
+
157
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
158
+
159
+ #########################################################################
160
+
161
+ elif 'neck.extra_convs.0' in k:
162
+ new_k = k.replace('neck.extra_convs.0', 'neck.convs.3')
163
+ new_k = new_k.replace('neck.convs', 'input_proj')
164
+ new_k = new_k.replace('conv.weight', '0.weight')
165
+ new_k = new_k.replace('conv.bias', '0.bias')
166
+ new_k = new_k.replace('gn.weight', '1.weight')
167
+ new_k = new_k.replace('gn.bias', '1.bias')
168
+
169
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
170
+
171
+ #########################################################################
172
+
173
+ elif k.startswith('text_feat_map'):
174
+ new_k = k.replace('text_feat_map', 'feat_map')
175
+
176
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
177
+
178
+ #########################################################################
179
+
180
+ elif k.startswith('language_model.language_backbone.body.model'):
181
+ new_k = k.replace('language_model.language_backbone.body.model', 'bert')
182
+
183
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
184
+
185
+ #########################################################################
186
+
187
+ elif k.startswith('backbone'):
188
+ new_k = k.replace('backbone', 'backbone.0')
189
+ if 'patch_embed.projection' in new_k:
190
+ new_k = new_k.replace('patch_embed.projection', 'patch_embed.proj')
191
+ elif 'drop_after_pos' in new_k:
192
+ new_k = new_k.replace('drop_after_pos', 'pos_drop')
193
+
194
+ if 'stages' in new_k:
195
+ new_k = new_k.replace('stages', 'layers')
196
+ if 'ffn.layers.0.0' in new_k:
197
+ new_k = new_k.replace('ffn.layers.0.0', 'mlp.fc1')
198
+ elif 'ffn.layers.1' in new_k:
199
+ new_k = new_k.replace('ffn.layers.1', 'mlp.fc2')
200
+ elif 'attn.w_msa' in new_k:
201
+ new_k = new_k.replace('attn.w_msa', 'attn')
202
+
203
+ if 'downsample' in k:
204
+ if 'reduction.' in k:
205
+ new_v = correct_unfold_reduction_order(v)
206
+ elif 'norm.' in k:
207
+ new_v = correct_unfold_norm_order(v)
208
+
209
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
210
+
211
+ #########################################################################
212
+
213
+ else:
214
+ print('skip:', k)
215
+ continue
216
+
217
+ # if 'transformer.decoder.bbox_embed' in new_k:
218
+ # new_k = new_k.replace('transformer.decoder.bbox_embed', 'bbox_embed')
219
+ # if new_k.startswith('module.'):
220
+ # new_k = new_k.replace('module.', '')
221
+
222
+ return new_ckpt
223
+
224
+ def main():
225
+ parser = argparse.ArgumentParser(
226
+ description='Convert keys to GroundingDINO style.')
227
+ parser.add_argument(
228
+ 'src',
229
+ nargs='?',
230
+ default='grounding_dino_swin-b_pretrain_all-f9818a7c.pth',
231
+ help='src model path or url')
232
+ # The dst path must be a full path of the new checkpoint.
233
+ parser.add_argument(
234
+ 'dst',
235
+ nargs='?',
236
+ default='mmdet_swinb_cogcoor.pth_groundingdino.pth',
237
+ help='save path')
238
+ args = parser.parse_args()
239
+
240
+ checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu')
241
+
242
+ # mmdet stores 'state_dict' rather than 'model'
243
+ if 'state_dict' in checkpoint:
244
+ state_dict = checkpoint['state_dict']
245
+ else:
246
+ state_dict = checkpoint
247
+
248
+ weight = convert(state_dict)
249
+ torch.save(weight, args.dst)
250
+ # sha = subprocess.check_output(['sha256sum', args.dst]).decode()
251
+ # sha = calculate_sha256(args.dst)
252
+ # final_file = args.dst.replace('.pth', '') + '-{}.pth'.format(sha[:8])
253
+ # subprocess.Popen(['mv', args.dst, final_file])
254
+ print(f'Done! Saved to {args.dst}')
255
+
256
+ if __name__ == '__main__':
257
+ main()
258
+
259
+ # skip: dn_query_generator.label_embedding.weight
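
The same conversion can also be driven in-process instead of via the CLI defaults above. A hypothetical sketch, not part of this commit, assuming mmengine is installed and the hf_model package is importable; the paths are the script's own defaults.

import torch
from mmengine.runner import CheckpointLoader
from hf_model.mmdet2groundingdino_swinb import convert

checkpoint = CheckpointLoader.load_checkpoint(
    "grounding_dino_swin-b_pretrain_all-f9818a7c.pth", map_location="cpu")
state_dict = checkpoint.get("state_dict", checkpoint)  # mmdet stores weights under 'state_dict'
torch.save(convert(state_dict), "mmdet_swinb_cogcoor.pth_groundingdino.pth")
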
hf_model/mmdet2groundingdino_swinl.py ADDED
@@ -0,0 +1,259 @@
1
+ # mmdet to groundingdino
2
+ import argparse
3
+ from collections import OrderedDict
4
+ import torch
5
+ from mmengine.runner import CheckpointLoader
6
+
7
+ # convert the functions from mmdet to groundingdino
8
+ def correct_unfold_reduction_order(x):
9
+ out_channel, in_channel = x.shape
10
+ x = x.reshape(out_channel, in_channel // 4, 4).transpose(1, 2)
11
+ x = x[:, [0, 2, 1, 3], :]
12
+ x = x.reshape(out_channel, in_channel)
13
+ return x
14
+
15
+ def correct_unfold_norm_order(x):
16
+ in_channel = x.shape[0]
17
+ x = x.reshape(in_channel // 4, 4).transpose(0, 1)
18
+ x = x[[0, 2, 1, 3], :]
19
+ x = x.reshape(in_channel)
20
+ return x
21
+
22
+ def convert(ckpt):
23
+ """Inverse mapping of checkpoint parameters to their original names."""
24
+ # Create a dictionary to hold the reversed checkpoint
25
+ new_ckpt = OrderedDict()
26
+
27
+ for k, v in list(ckpt.items()):
28
+ new_v = v # Start with the original value
29
+
30
+ # Inverse rules based on the convert function (from specific to general)
31
+ if k.startswith('decoder'):
32
+ new_k = k.replace('decoder', 'transformer.decoder')
33
+ if 'norms.2' in new_k:
34
+ new_k = new_k.replace('norms.2', 'norm1')
35
+ if 'norms.1' in new_k:
36
+ new_k = new_k.replace('norms.1', 'catext_norm')
37
+ if 'norms.0' in new_k:
38
+ new_k = new_k.replace('norms.0', 'norm2')
39
+ if 'norms.3' in new_k:
40
+ new_k = new_k.replace('norms.3', 'norm3')
41
+ if 'cross_attn_text' in new_k:
42
+ new_k = new_k.replace('cross_attn_text', 'ca_text')
43
+ new_k = new_k.replace('attn.in_proj_weight', 'in_proj_weight')
44
+ new_k = new_k.replace('attn.in_proj_bias', 'in_proj_bias')
45
+ new_k = new_k.replace('attn.out_proj.weight', 'out_proj.weight')
46
+ new_k = new_k.replace('attn.out_proj.bias', 'out_proj.bias')
47
+ if 'ffn.layers.0.0' in new_k:
48
+ new_k = new_k.replace('ffn.layers.0.0', 'linear1')
49
+ if 'ffn.layers.1' in new_k:
50
+ new_k = new_k.replace('ffn.layers.1', 'linear2')
51
+ if 'self_attn.attn' in new_k:
52
+ new_k = new_k.replace('self_attn.attn', 'self_attn')
53
+
54
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
55
+
56
+ #########################################################################
57
+
58
+ # the last reg_layer_id of the encoder part is 6, which distinguishes it from the decoder
59
+ elif k.startswith('bbox_head.reg_branches.6'):
60
+ if k.startswith('bbox_head.reg_branches.6.0'):
61
+ new_k = k.replace('bbox_head.reg_branches.6.0',
62
+ 'transformer.enc_out_bbox_embed.layers.0')
63
+ if k.startswith('bbox_head.reg_branches.6.2'):
64
+ new_k = k.replace('bbox_head.reg_branches.6.2',
65
+ 'transformer.enc_out_bbox_embed.layers.1')
66
+ if k.startswith('bbox_head.reg_branches.6.4'):
67
+ new_k = k.replace('bbox_head.reg_branches.6.4',
68
+ 'transformer.enc_out_bbox_embed.layers.2')
69
+
70
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
71
+
72
+ #########################################################################
73
+
74
+ elif k.startswith('query_embedding'):
75
+ new_k = k.replace('query_embedding', 'transformer.tgt_embed')
76
+
77
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
78
+
79
+ #########################################################################
80
+
81
+ elif k.startswith('bbox_head.reg_branches'):
82
+ # mmdet omits part of the parameter name; check the groundingdino checkpoint
83
+ # groundingdino has two groups of parameters with identical values,
84
+ # namely bbox_embed and transformer.decoder.embed,
85
+ # so mmdet simply "merged" the two groups of parameters
86
+ reg_layer_id = int(k.split('.')[2])
87
+ linear_id = int(k.split('.')[3])
88
+ weight_or_bias = k.split('.')[-1]
89
+ new_k1 = 'transformer.decoder.bbox_embed.' + \
90
+ str(reg_layer_id) + '.layers.' + str(linear_id // 2) + '.' + weight_or_bias
91
+ new_k2 = 'bbox_embed.' + \
92
+ str(reg_layer_id) + '.layers.' + str(linear_id // 2) + '.' + weight_or_bias
93
+
94
+ new_ckpt[new_k1] = new_v # Add the key and value to the original checkpoint dict
95
+ new_ckpt[new_k2] = new_v # Add the key and value to the original checkpoint dict
96
+
97
+ #########################################################################
98
+
99
+ elif k.startswith('bbox_head.cls_branches.6'):
100
+ # mmdet adds a bias term in contrastive_embed
101
+ # but the decoder uses indices 0~5, so index 6 corresponds to enc_out.class_embed after two-stage fine-tuning
102
+ new_k = 'transformer.enc_out_class_embed.bias'
103
+
104
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
105
+
106
+ #########################################################################
107
+
108
+ elif k.startswith('bbox_head.cls_branches'):
109
+ # mmdet adds a bias term in contrastive_embed
110
+ new_k1 = 'transformer.decoder.class_embed.' + k[-6:]
111
+ new_k2 = 'class_embed.' + k[-6:]
112
+
113
+ new_ckpt[new_k1] = new_v # Add the key and value to the original checkpoint dict
114
+ new_ckpt[new_k2] = new_v # Add the key and value to the original checkpoint dict
115
+
116
+ #########################################################################
117
+
118
+ elif k.startswith('memory_trans_'):
119
+ if k.startswith('memory_trans_fc'):
120
+ new_k = k.replace('memory_trans_fc', 'transformer.enc_output')
121
+ elif k.startswith('memory_trans_norm'):
122
+ new_k = k.replace('memory_trans_norm', 'transformer.enc_output_norm')
123
+
124
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
125
+
126
+ #########################################################################
127
+
128
+ elif k.startswith('encoder'):
129
+ new_k = k.replace('encoder', 'transformer.encoder')
130
+ new_k = new_k.replace('norms.0', 'norm1')
131
+ new_k = new_k.replace('norms.1', 'norm2')
132
+ new_k = new_k.replace('norms.2', 'norm3')
133
+ new_k = new_k.replace('ffn.layers.0.0', 'linear1')
134
+ new_k = new_k.replace('ffn.layers.1', 'linear2')
135
+ if 'text_layers' in new_k:
136
+ new_k = new_k.replace('self_attn.attn', 'self_attn')
137
+
138
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
139
+
140
+ #########################################################################
141
+
142
+ elif k.startswith('level_embed'):
143
+ new_k = k.replace('level_embed', 'transformer.level_embed')
144
+
145
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
146
+
147
+ #########################################################################
148
+
149
+ elif k.startswith('neck.convs'):
150
+ new_k = k.replace('neck.convs', 'input_proj')
151
+ new_k = new_k.replace('neck.extra_convs.0', 'neck.convs.3')
152
+ new_k = new_k.replace('conv.weight', '0.weight')
153
+ new_k = new_k.replace('conv.bias', '0.bias')
154
+ new_k = new_k.replace('gn.weight', '1.weight')
155
+ new_k = new_k.replace('gn.bias', '1.bias')
156
+
157
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
158
+
159
+ #########################################################################
160
+
161
+ elif 'neck.extra_convs.0' in k:
162
+ new_k = k.replace('neck.extra_convs.0', 'neck.convs.4')
163
+ new_k = new_k.replace('neck.convs', 'input_proj')
164
+ new_k = new_k.replace('conv.weight', '0.weight')
165
+ new_k = new_k.replace('conv.bias', '0.bias')
166
+ new_k = new_k.replace('gn.weight', '1.weight')
167
+ new_k = new_k.replace('gn.bias', '1.bias')
168
+
169
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
170
+
171
+ #########################################################################
172
+
173
+ elif k.startswith('text_feat_map'):
174
+ new_k = k.replace('text_feat_map', 'feat_map')
175
+
176
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
177
+
178
+ #########################################################################
179
+
180
+ elif k.startswith('language_model.language_backbone.body.model'):
181
+ new_k = k.replace('language_model.language_backbone.body.model', 'bert')
182
+
183
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
184
+
185
+ #########################################################################
186
+
187
+ elif k.startswith('backbone'):
188
+ new_k = k.replace('backbone', 'backbone.0')
189
+ if 'patch_embed.projection' in new_k:
190
+ new_k = new_k.replace('patch_embed.projection', 'patch_embed.proj')
191
+ elif 'drop_after_pos' in new_k:
192
+ new_k = new_k.replace('drop_after_pos', 'pos_drop')
193
+
194
+ if 'stages' in new_k:
195
+ new_k = new_k.replace('stages', 'layers')
196
+ if 'ffn.layers.0.0' in new_k:
197
+ new_k = new_k.replace('ffn.layers.0.0', 'mlp.fc1')
198
+ elif 'ffn.layers.1' in new_k:
199
+ new_k = new_k.replace('ffn.layers.1', 'mlp.fc2')
200
+ elif 'attn.w_msa' in new_k:
201
+ new_k = new_k.replace('attn.w_msa', 'attn')
202
+
203
+ if 'downsample' in k:
204
+ if 'reduction.' in k:
205
+ new_v = correct_unfold_reduction_order(v)
206
+ elif 'norm.' in k:
207
+ new_v = correct_unfold_norm_order(v)
208
+
209
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
210
+
211
+ #########################################################################
212
+
213
+ else:
214
+ print('skip:', k)
215
+ continue
216
+
217
+ # if 'transformer.decoder.bbox_embed' in new_k:
218
+ # new_k = new_k.replace('transformer.decoder.bbox_embed', 'bbox_embed')
219
+ # if new_k.startswith('module.'):
220
+ # new_k = new_k.replace('module.', '')
221
+
222
+ return new_ckpt
223
+
224
+ def main():
225
+ parser = argparse.ArgumentParser(
226
+ description='Convert keys to GroundingDINO style.')
227
+ parser.add_argument(
228
+ 'src',
229
+ nargs='?',
230
+ default='grounding_dino_swin-l_pretrain_all-56d69e78.pth',
231
+ help='src model path or url')
232
+ # The dst path must be a full path of the new checkpoint.
233
+ parser.add_argument(
234
+ 'dst',
235
+ nargs='?',
236
+ default='mmdet_swinl.pth_groundingdino.pth',
237
+ help='save path')
238
+ args = parser.parse_args()
239
+
240
+ checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu')
241
+
242
+ # mmdet stores 'state_dict' rather than 'model'
243
+ if 'state_dict' in checkpoint:
244
+ state_dict = checkpoint['state_dict']
245
+ else:
246
+ state_dict = checkpoint
247
+
248
+ weight = convert(state_dict)
249
+ torch.save(weight, args.dst)
250
+ # sha = subprocess.check_output(['sha256sum', args.dst]).decode()
251
+ # sha = calculate_sha256(args.dst)
252
+ # final_file = args.dst.replace('.pth', '') + '-{}.pth'.format(sha[:8])
253
+ # subprocess.Popen(['mv', args.dst, final_file])
254
+ print(f'Done! Saved to {args.dst}')
255
+
256
+ if __name__ == '__main__':
257
+ main()
258
+
259
+ # skip: dn_query_generator.label_embedding.weight
hf_model/mmdet2groundingdino_swint.py ADDED
@@ -0,0 +1,259 @@
1
+ # mmdet to groundingdino
2
+ import argparse
3
+ from collections import OrderedDict
4
+ import torch
5
+ from mmengine.runner import CheckpointLoader
6
+
7
+ # convert the functions from mmdet to groundingdino
8
+ def correct_unfold_reduction_order(x):
9
+ out_channel, in_channel = x.shape
10
+ x = x.reshape(out_channel, in_channel // 4, 4).transpose(1, 2)
11
+ x = x[:, [0, 2, 1, 3], :]
12
+ x = x.reshape(out_channel, in_channel)
13
+ return x
14
+
15
+ def correct_unfold_norm_order(x):
16
+ in_channel = x.shape[0]
17
+ x = x.reshape(in_channel // 4, 4).transpose(0, 1)
18
+ x = x[[0, 2, 1, 3], :]
19
+ x = x.reshape(in_channel)
20
+ return x
21
+
22
+ def convert(ckpt):
23
+ """Inverse mapping of checkpoint parameters to their original names."""
24
+ # Create a dictionary to hold the reversed checkpoint
25
+ new_ckpt = OrderedDict()
26
+
27
+ for k, v in list(ckpt.items()):
28
+ new_v = v # Start with the original value
29
+
30
+ # Inverse rules based on the convert function (from specific to general)
31
+ if k.startswith('decoder'):
32
+ new_k = k.replace('decoder', 'module.transformer.decoder')
33
+ if 'norms.2' in new_k:
34
+ new_k = new_k.replace('norms.2', 'norm1')
35
+ if 'norms.1' in new_k:
36
+ new_k = new_k.replace('norms.1', 'catext_norm')
37
+ if 'norms.0' in new_k:
38
+ new_k = new_k.replace('norms.0', 'norm2')
39
+ if 'norms.3' in new_k:
40
+ new_k = new_k.replace('norms.3', 'norm3')
41
+ if 'cross_attn_text' in new_k:
42
+ new_k = new_k.replace('cross_attn_text', 'ca_text')
43
+ new_k = new_k.replace('attn.in_proj_weight', 'in_proj_weight')
44
+ new_k = new_k.replace('attn.in_proj_bias', 'in_proj_bias')
45
+ new_k = new_k.replace('attn.out_proj.weight', 'out_proj.weight')
46
+ new_k = new_k.replace('attn.out_proj.bias', 'out_proj.bias')
47
+ if 'ffn.layers.0.0' in new_k:
48
+ new_k = new_k.replace('ffn.layers.0.0', 'linear1')
49
+ if 'ffn.layers.1' in new_k:
50
+ new_k = new_k.replace('ffn.layers.1', 'linear2')
51
+ if 'self_attn.attn' in new_k:
52
+ new_k = new_k.replace('self_attn.attn', 'self_attn')
53
+
54
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
55
+
56
+ #########################################################################
57
+
58
+ # the last reg_layer_id of the encoder part is 6, which distinguishes it from the decoder
59
+ elif k.startswith('bbox_head.reg_branches.6'):
60
+ if k.startswith('bbox_head.reg_branches.6.0'):
61
+ new_k = k.replace('bbox_head.reg_branches.6.0',
62
+ 'module.transformer.enc_out_bbox_embed.layers.0')
63
+ if k.startswith('bbox_head.reg_branches.6.2'):
64
+ new_k = k.replace('bbox_head.reg_branches.6.2',
65
+ 'module.transformer.enc_out_bbox_embed.layers.1')
66
+ if k.startswith('bbox_head.reg_branches.6.4'):
67
+ new_k = k.replace('bbox_head.reg_branches.6.4',
68
+ 'module.transformer.enc_out_bbox_embed.layers.2')
69
+
70
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
71
+
72
+ #########################################################################
73
+
74
+ elif k.startswith('query_embedding'):
75
+ new_k = k.replace('query_embedding', 'module.transformer.tgt_embed')
76
+
77
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
78
+
79
+ #########################################################################
80
+
81
+ elif k.startswith('bbox_head.reg_branches'):
82
+ # mmdet omits part of the parameter name; check the groundingdino checkpoint
83
+ # groundingdino has two groups of parameters with identical values,
84
+ # namely module.bbox_embed and module.transformer.decoder.embed,
85
+ # so mmdet simply "merged" the two groups of parameters
86
+ reg_layer_id = int(k.split('.')[2])
87
+ linear_id = int(k.split('.')[3])
88
+ weight_or_bias = k.split('.')[-1]
89
+ new_k1 = 'module.transformer.decoder.bbox_embed.' + \
90
+ str(reg_layer_id) + '.layers.' + str(linear_id // 2) + '.' + weight_or_bias
91
+ new_k2 = 'module.bbox_embed.' + \
92
+ str(reg_layer_id) + '.layers.' + str(linear_id // 2) + '.' + weight_or_bias
93
+
94
+ new_ckpt[new_k1] = new_v # Add the key and value to the original checkpoint dict
95
+ new_ckpt[new_k2] = new_v # Add the key and value to the original checkpoint dict
96
+
97
+ #########################################################################
98
+
99
+ elif k.startswith('bbox_head.cls_branches.6'):
100
+ # mmdet adds a bias term in contrastive_embed
101
+ # the decoder branches are indexed 0-5, so index 6 corresponds to the enc_out.class_embed used after two-stage fine-tuning
102
+ new_k = 'module.transformer.enc_out_class_embed.bias'
103
+
104
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
105
+
106
+ #########################################################################
107
+
108
+ elif k.startswith('bbox_head.cls_branches'):
109
+ # mmdet adds a bias term in contrastive_embed
110
+ new_k1 = 'module.transformer.decoder.class_embed.' + k[-6:]
111
+ new_k2 = 'module.class_embed.' + k[-6:]
112
+
113
+ new_ckpt[new_k1] = new_v # Add the key and value to the original checkpoint dict
114
+ new_ckpt[new_k2] = new_v # Add the key and value to the original checkpoint dict
115
+
116
+ #########################################################################
117
+
118
+ elif k.startswith('memory_trans_'):
119
+ if k.startswith('memory_trans_fc'):
120
+ new_k = k.replace('memory_trans_fc', 'module.transformer.enc_output')
121
+ elif k.startswith('memory_trans_norm'):
122
+ new_k = k.replace('memory_trans_norm', 'module.transformer.enc_output_norm')
123
+
124
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
125
+
126
+ #########################################################################
127
+
128
+ elif k.startswith('encoder'):
129
+ new_k = k.replace('encoder', 'module.transformer.encoder')
130
+ new_k = new_k.replace('norms.0', 'norm1')
131
+ new_k = new_k.replace('norms.1', 'norm2')
132
+ new_k = new_k.replace('norms.2', 'norm3')
133
+ new_k = new_k.replace('ffn.layers.0.0', 'linear1')
134
+ new_k = new_k.replace('ffn.layers.1', 'linear2')
135
+ if 'text_layers' in new_k:
136
+ new_k = new_k.replace('self_attn.attn', 'self_attn')
137
+
138
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
139
+
140
+ #########################################################################
141
+
142
+ elif k.startswith('level_embed'):
143
+ new_k = k.replace('level_embed', 'module.transformer.level_embed')
144
+
145
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
146
+
147
+ #########################################################################
148
+
149
+ elif k.startswith('neck.convs'):
150
+ new_k = k.replace('neck.convs', 'module.input_proj')
151
+ new_k = new_k.replace('neck.extra_convs.0', 'neck.convs.3')
152
+ new_k = new_k.replace('conv.weight', '0.weight')
153
+ new_k = new_k.replace('conv.bias', '0.bias')
154
+ new_k = new_k.replace('gn.weight', '1.weight')
155
+ new_k = new_k.replace('gn.bias', '1.bias')
156
+
157
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
158
+
159
+ #########################################################################
160
+
161
+ elif 'neck.extra_convs.0' in k:
162
+ new_k = k.replace('neck.extra_convs.0', 'neck.convs.3')
163
+ new_k = new_k.replace('neck.convs', 'module.input_proj')
164
+ new_k = new_k.replace('conv.weight', '0.weight')
165
+ new_k = new_k.replace('conv.bias', '0.bias')
166
+ new_k = new_k.replace('gn.weight', '1.weight')
167
+ new_k = new_k.replace('gn.bias', '1.bias')
168
+
169
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
170
+
171
+ #########################################################################
172
+
173
+ elif k.startswith('text_feat_map'):
174
+ new_k = k.replace('text_feat_map', 'module.feat_map')
175
+
176
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
177
+
178
+ #########################################################################
179
+
180
+ elif k.startswith('language_model.language_backbone.body.model'):
181
+ new_k = k.replace('language_model.language_backbone.body.model', 'module.bert')
182
+
183
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
184
+
185
+ #########################################################################
186
+
187
+ elif k.startswith('backbone'):
188
+ new_k = k.replace('backbone', 'module.backbone.0')
189
+ if 'patch_embed.projection' in new_k:
190
+ new_k = new_k.replace('patch_embed.projection', 'patch_embed.proj')
191
+ elif 'drop_after_pos' in new_k:
192
+ new_k = new_k.replace('drop_after_pos', 'pos_drop')
193
+
194
+ if 'stages' in new_k:
195
+ new_k = new_k.replace('stages', 'layers')
196
+ if 'ffn.layers.0.0' in new_k:
197
+ new_k = new_k.replace('ffn.layers.0.0', 'mlp.fc1')
198
+ elif 'ffn.layers.1' in new_k:
199
+ new_k = new_k.replace('ffn.layers.1', 'mlp.fc2')
200
+ elif 'attn.w_msa' in new_k:
201
+ new_k = new_k.replace('attn.w_msa', 'attn')
202
+
203
+ if 'downsample' in k:
204
+ if 'reduction.' in k:
205
+ new_v = correct_unfold_reduction_order(v)
206
+ elif 'norm.' in k:
207
+ new_v = correct_unfold_norm_order(v)
208
+
209
+ new_ckpt[new_k] = new_v # Add the key and value to the original checkpoint dict
210
+
211
+ #########################################################################
212
+
213
+ else:
214
+ print('skip:', k)
215
+ continue
216
+
217
+ # if 'module.transformer.decoder.bbox_embed' in new_k:
218
+ # new_k = new_k.replace('module.transformer.decoder.bbox_embed', 'module.bbox_embed')
219
+ # if new_k.startswith('module'):
220
+ # new_k = new_k.replace('module.', '')
221
+
222
+ return new_ckpt
223
+
224
+ def main():
225
+ parser = argparse.ArgumentParser(
226
+ description='Convert keys to GroundingDINO style.')
227
+ parser.add_argument(
228
+ 'src',
229
+ nargs='?',
230
+ default='grounding_dino_swin-t_pretrain_obj365_goldg_v3det_20231218_095741-e316e297.pth',
231
+ help='src model path or url')
232
+ # The dst path must be a full path of the new checkpoint.
233
+ parser.add_argument(
234
+ 'dst',
235
+ nargs='?',
236
+ default='check_mmdet_to_groundingdino.pth',
237
+ help='save path')
238
+ args = parser.parse_args()
239
+
240
+ checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu')
241
+
242
+ # mmdet checkpoints store the weights under 'state_dict' rather than 'model'
243
+ if 'state_dict' in checkpoint:
244
+ state_dict = checkpoint['state_dict']
245
+ else:
246
+ state_dict = checkpoint
247
+
248
+ weight = convert(state_dict)
249
+ torch.save(weight, args.dst)
250
+ # sha = subprocess.check_output(['sha256sum', args.dst]).decode()
251
+ # sha = calculate_sha256(args.dst)
252
+ # final_file = args.dst.replace('.pth', '') + '-{}.pth'.format(sha[:8])
253
+ # subprocess.Popen(['mv', args.dst, final_file])
254
+ print(f'Done! Saved to {args.dst}')
255
+
256
+ if __name__ == '__main__':
257
+ main()
258
+
259
+ # skip: dn_query_generator.label_embedding.weight
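For readers who want to sanity-check the mapping above, a minimal, hypothetical round trip might look like the sketch below; the checkpoint filenames are placeholders, and the final assertion only reflects the fact that every rule in convert() emits keys under the 'module.' namespace.

# Hypothetical usage of convert(); paths are placeholders.
import torch

ckpt = torch.load('grounding_dino_mmdet.pth', map_location='cpu', weights_only=False)
state_dict = ckpt.get('state_dict', ckpt)      # mmdet stores weights under 'state_dict'

converted = convert(state_dict)                # mmdet names -> GroundingDINO-style names
assert all(k.startswith('module.') for k in converted)
torch.save(converted, 'groundingdino_style.pth')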
hf_model/modeling_grounding_dino.py ADDED
The diff for this file is too large to render. See raw diff
 
utils.py ADDED
@@ -0,0 +1,455 @@
1
+ import torch
2
+ import numpy as np
3
+ from transformers import GroundingDinoProcessor
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ from PIL import Image
7
+ import torch
8
+
9
+
10
+ def prepare_targets(points, caption, shapes, emb_size, device, llmdet_processor):
11
+ gt_points_b = [np.array(points) / np.array(shapes)[::-1]]
12
+ gt_points_b[0] = gt_points_b[0].squeeze(0)
13
+
14
+ gt_points = [torch.from_numpy(img_points).float() for img_points in gt_points_b]
15
+ gt_logits = [torch.zeros((img_points.shape[0], emb_size)) for img_points in gt_points]
16
+
17
+ tokenized = llmdet_processor.tokenizer(caption[0], padding="longest", return_tensors="pt")
18
+ end_idxes = [torch.where(ids == 1012)[0][-1] for ids in tokenized['input_ids']]  # 1012 is the BERT token id for '.'
19
+ for i, end_idx in enumerate(end_idxes):
20
+ gt_logits[i][:, :end_idx] = 1.0
21
+ caption_sizes = [idx + 2 for idx in end_idxes]
22
+
23
+ targets = [{"points": p.to(device), "labels": l.to(device), "caption_size": c}
24
+ for p, l, c in zip(gt_points, gt_logits, caption_sizes)]
25
+
26
+ return targets
27
+
28
+
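As a rough, illustrative call of prepare_targets (the coordinates, caption, and emb_size below are invented; the nesting of points, caption, and shapes follows the collator defined further down):

# Hypothetical inputs for a single image.
from transformers import GroundingDinoProcessor

proc = GroundingDinoProcessor.from_pretrained("fushh7/llmdet_swin_tiny_hf")
points = [[[120.0, 80.0], [300.0, 210.0]]]      # pixel (x, y) points for one image
caption = [["red apple."]]
shapes = [(640, 480)]                            # (width, height)
targets = prepare_targets(points, caption, shapes, emb_size=256,
                          device="cpu", llmdet_processor=proc)
print(targets[0]["points"].shape, targets[0]["caption_size"])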
29
+ def post_process_grounded_object_detection(
30
+ outputs,
31
+ box_threshold: float = 0.4,
32
+ ):
33
+ # for the fine-tuned model, the box threshold should be set to 0.50
34
+ logits, boxes = outputs.logits, outputs.pred_boxes
35
+
36
+ probs = torch.sigmoid(logits) # (batch_size, num_queries, 256)
37
+ scores = torch.max(probs, dim=-1)[0] # (batch_size, num_queries)
38
+
39
+ results = []
40
+ for idx, (s, b, p) in enumerate(zip(scores, boxes, probs)):
41
+ score = s[s > box_threshold]
42
+ box = b[s > box_threshold]
43
+ prob = p[s > box_threshold]
44
+ results.append({"scores": score, "boxes": box})
45
+
46
+ return results
47
+
48
+
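A self-contained sketch of the post-processing above, using randomly generated tensors shaped like the model outputs (the query count and embedding size are made-up values):

# Dummy outputs carrying .logits and .pred_boxes, as post_process_grounded_object_detection expects.
from types import SimpleNamespace
import torch

dummy = SimpleNamespace(
    logits=torch.randn(1, 900, 256),    # (batch, num_queries, text_dim); values are random
    pred_boxes=torch.rand(1, 900, 4),   # normalized cxcywh boxes
)
results = post_process_grounded_object_detection(dummy, box_threshold=0.4)
print(len(results[0]["boxes"]), "boxes kept above the threshold")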
49
+ class collator:
50
+ def __init__(self, processor=None, use_negative=True):
51
+ model_id = "fushh7/llmdet_swin_tiny_hf"
52
+ self.llmdet_processor = GroundingDinoProcessor.from_pretrained(model_id)
53
+ self.use_negative = use_negative
54
+
55
+ def __call__(self, batch):
56
+ # assume batch size is 1
57
+ example = batch[0]
58
+ image = example['image']
59
+ pil_image = example['image']
60
+ w, h = image.size
61
+ pos_caption = example['pos_caption']
62
+ neg_caption = example['neg_caption']
63
+ pos_points = example['pos_points']
64
+ neg_points = example['neg_points']
65
+ pos_count = example['pos_count']
66
+ neg_count = example['neg_count']
67
+ annotated_pos_count = example['annotated_pos_count']
68
+ annotated_neg_count = example['annotated_neg_count']
69
+
70
+ if 'type' in example:
71
+ sample_type = example['type']
72
+ else:
73
+ sample_type = 'eval'
74
+ category = example['category']
75
+ image_name = "{}_{}_{}_{}_{}".format(category, pos_caption, neg_caption, pos_count, neg_count)
76
+ pos_llm_det_inputs = self.llmdet_processor(images=image, text=pos_caption, return_tensors="pt", padding=True)
77
+ neg_llm_det_inputs = self.llmdet_processor(images=image, text=neg_caption, return_tensors="pt", padding=True)
78
+ pos_caption = [[pos_caption]]
79
+ neg_caption = [[neg_caption]]
80
+ shapes = [(w, h)]
81
+ pos_points = [pos_points]
82
+ neg_points = [neg_points]
83
+
84
+ # exemplars
85
+ if 'positive_exemplars' in example and 'negative_exemplars' in example and example[
86
+ 'positive_exemplars'] is not None and example['negative_exemplars'] is not None:
87
+ pos_exemplars = example['positive_exemplars']
88
+ neg_exemplars = example['negative_exemplars']
89
+ img_width, img_height = pil_image.size  # PIL .size returns (width, height)
90
+ norm_pos_exemplars = []
91
+ norm_neg_exemplars = []
92
+ exemplar_valid = True
93
+ for exemplars in pos_exemplars:
94
+ tly, tlx, bry, brx = exemplars
95
+ tlx = tlx / img_width
96
+ tly = tly / img_height
97
+ brx = brx / img_width
98
+ bry = bry / img_height
99
+ if tlx < 0 or tly < 0 or tlx > 1.0 or tly > 1.0:
100
+ exemplar_valid = False
101
+ if brx < 0 or bry < 0 or brx > 1.0 or bry > 1.0:
102
+ exemplar_valid = False
103
+ if tlx >= brx or tly >= bry:
104
+ exemplar_valid = False
105
+ tlx = max(tlx, 0)
106
+ tly = max(tly, 0)
107
+ tly = min(tly, 1 - 1e-4)
108
+ tlx = min(tlx, 1 - 1e-4)
109
+ brx = min(brx, 1)
110
+ bry = min(bry, 1)
111
+ brx = max(brx, tlx)
112
+ bry = max(bry, tly)
113
+ assert tlx >= 0 and tly >= 0 and brx <= 1 and bry <= 1 and tlx <= brx and tly <= bry, f"tlx: {tlx}, tly: {tly}, brx: {brx}, bry: {bry}"
114
+ norm_pos_exemplars.append([tlx, tly, brx, bry])
115
+ for exemplars in neg_exemplars:
116
+ tly, tlx, bry, brx = exemplars
117
+ tlx = tlx / img_width
118
+ tly = tly / img_height
119
+ brx = brx / img_width
120
+ bry = bry / img_height
121
+ if tlx < 0 or tly < 0 or tlx > 1.0 or tly > 1.0:
122
+ exemplar_valid = False
123
+ if brx < 0 or bry < 0 or brx > 1.0 or bry > 1.0:
124
+ exemplar_valid = False
125
+ if tlx >= brx or tly >= bry:
126
+ exemplar_valid = False
127
+ tlx = max(tlx, 0)
128
+ tly = max(tly, 0)
129
+ tly = min(tly, 1 - 1e-4)
130
+ tlx = min(tlx, 1 - 1e-4)
131
+ brx = min(brx, 1)
132
+ bry = min(bry, 1)
133
+ brx = max(brx, tlx)
134
+ bry = max(bry, tly)
135
+ assert tlx >= 0 and tly >= 0 and brx <= 1 and bry <= 1 and tlx <= brx and tly <= bry, f"tlx: {tlx}, tly: {tly}, brx: {brx}, bry: {bry}"
136
+ norm_neg_exemplars.append([tlx, tly, brx, bry])
137
+
138
+ if exemplar_valid:
139
+ pos_exemplars = [torch.from_numpy(np.array(exemplars)).float() for exemplars in norm_pos_exemplars]
140
+ neg_exemplars = [torch.from_numpy(np.array(exemplars)).float() for exemplars in norm_neg_exemplars]
141
+ pos_exemplars = torch.stack(pos_exemplars)
142
+ neg_exemplars = torch.stack(neg_exemplars)
143
+ batch_dict = {
144
+ 'pos_llm_det_inputs': pos_llm_det_inputs,
145
+ 'neg_llm_det_inputs': neg_llm_det_inputs,
146
+ 'pos_caption': pos_caption,
147
+ 'neg_caption': neg_caption,
148
+ 'shapes': shapes,
149
+ 'pos_points': pos_points,
150
+ 'neg_points': neg_points,
151
+ 'pos_count': pos_count,
152
+ 'neg_count': neg_count,
153
+ 'annotated_pos_count': annotated_pos_count,
154
+ 'annotated_neg_count': annotated_neg_count,
155
+ 'image': pil_image,
156
+ 'category': category,
157
+ 'type': sample_type,
158
+ 'pos_exemplars': pos_exemplars,
159
+ 'neg_exemplars': neg_exemplars,
160
+ 'image_name': image_name,
161
+ }
162
+ else:
163
+ batch_dict = {
164
+ 'pos_llm_det_inputs': pos_llm_det_inputs,
165
+ 'neg_llm_det_inputs': neg_llm_det_inputs,
166
+ 'pos_caption': pos_caption,
167
+ 'neg_caption': neg_caption,
168
+ 'shapes': shapes,
169
+ 'pos_points': pos_points,
170
+ 'neg_points': neg_points,
171
+ 'pos_count': pos_count,
172
+ 'neg_count': neg_count,
173
+ 'annotated_pos_count': annotated_pos_count,
174
+ 'annotated_neg_count': annotated_neg_count,
175
+ 'image': pil_image,
176
+ 'category': category,
177
+ 'type': sample_type,
178
+ 'image_name': image_name,
179
+ }
180
+ else:
181
+ batch_dict = {
182
+ 'pos_llm_det_inputs': pos_llm_det_inputs,
183
+ 'neg_llm_det_inputs': neg_llm_det_inputs,
184
+ 'pos_caption': pos_caption,
185
+ 'neg_caption': neg_caption,
186
+ 'shapes': shapes,
187
+ 'pos_points': pos_points,
188
+ 'neg_points': neg_points,
189
+ 'pos_count': pos_count,
190
+ 'neg_count': neg_count,
191
+ 'annotated_pos_count': annotated_pos_count,
192
+ 'annotated_neg_count': annotated_neg_count,
193
+ 'image': pil_image,
194
+ 'category': category,
195
+ 'type': sample_type,
196
+ 'image_name': image_name,
197
+ }
198
+
199
+ return batch_dict
200
+
201
+
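A small wiring sketch for the collator; every field in the fake sample below is invented, and only the keys the collator actually reads are provided (exemplars are omitted, so the final else-branch is taken):

# Hypothetical single sample; the processor is downloaded inside collator.__init__.
from PIL import Image

fake_sample = {
    "image": Image.new("RGB", (64, 48)),
    "pos_caption": "red apple.", "neg_caption": "green apple.",
    "pos_points": [[10.0, 12.0]], "neg_points": [[30.0, 20.0]],
    "pos_count": 1, "neg_count": 1,
    "annotated_pos_count": 1, "annotated_neg_count": 1,
    "category": "FOO",
}
batch = collator()([fake_sample])
print(batch["image_name"], batch["shapes"])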
202
+ import torch.distributed as dist
203
+
204
+
205
+ def rank0_print(*args):
206
+ if dist.is_initialized():
207
+ if dist.get_rank() == 0:
208
+ print(f"Rank {dist.get_rank()}: ", *args)
209
+ else:
210
+ print(*args)
211
+
212
+
213
+ def build_dataset(data_args):
214
+ from datasets import load_from_disk, concatenate_datasets
215
+
216
+ categories = ["FOO", "FUN", "OFF", "OTR", "HOU"]
217
+
218
+ if data_args.data_split not in categories:
219
+ rank0_print(f"Warning: Invalid data_split '{data_args.data_split}'. Switching to 'all' mode.")
220
+ data_args.data_split = "all"
221
+
222
+ if data_args.data_split == "all":
223
+ train_dataset = load_from_disk(data_args.train_data_path)
224
+ train_dataset = concatenate_datasets(
225
+ [train_dataset["FOO"], train_dataset["FUN"], train_dataset["OFF"], train_dataset["OTR"],
226
+ train_dataset["HOU"]])
227
+
228
+ val_dataset = load_from_disk(data_args.val_data_path)
229
+ val_dataset = concatenate_datasets(
230
+ [val_dataset["FOO"], val_dataset["FUN"], val_dataset["OFF"], val_dataset["OTR"], val_dataset["HOU"]])
231
+
232
+ test_dataset = load_from_disk(data_args.test_data_path)
233
+ test_dataset = concatenate_datasets(
234
+ [test_dataset["FOO"], test_dataset["FUN"], test_dataset["OFF"], test_dataset["OTR"], test_dataset["HOU"]])
235
+
236
+ weakly_supervised_data = load_from_disk(data_args.weakly_supervised_data_path)
237
+ weakly_supervised_data = concatenate_datasets(
238
+ [weakly_supervised_data["FOO"], weakly_supervised_data["FUN"], weakly_supervised_data["OFF"],
239
+ weakly_supervised_data["OTR"], weakly_supervised_data["HOU"]])
240
+
241
+ rank0_print("Using 'all' mode: all categories for train/val/test")
242
+
243
+ else:
244
+ test_category = data_args.data_split
245
+ train_categories = [cat for cat in categories if cat != test_category]
246
+ train_dataset = load_from_disk(data_args.train_data_path)
247
+ print(train_categories, train_dataset.keys())
248
+ train_datasets = [train_dataset[cat] for cat in train_categories]
249
+ train_dataset = concatenate_datasets(train_datasets)
250
+
251
+ weakly_supervised_data = load_from_disk(data_args.weakly_supervised_data_path)
252
+ weakly_supervised_data = [weakly_supervised_data[cat] for cat in train_categories]
253
+ weakly_supervised_data = concatenate_datasets(weakly_supervised_data)
254
+
255
+ val_dataset = load_from_disk(data_args.val_data_path)
256
+ val_dataset = val_dataset[test_category]
257
+
258
+ test_dataset = load_from_disk(data_args.test_data_path)
259
+ test_dataset = test_dataset[test_category]
260
+
261
+ rank0_print(f"Cross-validation mode: using {train_categories} for train, {test_category} for val/test")
262
+
263
+ rank0_print('train_dataset: ', len(train_dataset))
264
+ rank0_print('val_dataset: ', len(val_dataset))
265
+ rank0_print('test_dataset: ', len(test_dataset))
266
+ rank0_print('weakly_supervised_data: ', len(weakly_supervised_data))
267
+
268
+ return train_dataset, val_dataset, test_dataset, weakly_supervised_data
269
+
270
+
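For context, build_dataset expects a data_args object exposing the attributes used above; a hypothetical container might look like the sketch below (all paths are placeholders and would have to point to datasets saved with save_to_disk):

# Placeholder argument object; choosing data_split="FOO" triggers the cross-validation branch.
from types import SimpleNamespace

data_args = SimpleNamespace(
    data_split="FOO",
    train_data_path="data/train_hf_dataset",
    val_data_path="data/val_hf_dataset",
    test_data_path="data/test_hf_dataset",
    weakly_supervised_data_path="data/weak_hf_dataset",
)
train_ds, val_ds, test_ds, weak_ds = build_dataset(data_args)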
271
+ def generate_pseudo_density_map(points_norm: torch.Tensor,
272
+ output_size: tuple[int, int],
273
+ sigma: float = 4.0,
274
+ normalize: bool = True) -> torch.Tensor:
275
+ device = points_norm.device
276
+ H, W = output_size
277
+ N = points_norm.shape[0]
278
+
279
+ ys = torch.arange(H, device=device).float()
280
+ xs = torch.arange(W, device=device).float()
281
+ grid_y, grid_x = torch.meshgrid(ys, xs, indexing='ij') # (H, W)
282
+
283
+ pts_px = points_norm.clone()
284
+ pts_px[:, 0] *= (W - 1) # x
285
+ pts_px[:, 1] *= (H - 1) # y
286
+
287
+ dx = grid_x.unsqueeze(0) - pts_px[:, 0].view(-1, 1, 1) # (N, H, W)
288
+ dy = grid_y.unsqueeze(0) - pts_px[:, 1].view(-1, 1, 1) # (N, H, W)
289
+ dist2 = dx ** 2 + dy ** 2
290
+ gaussians = torch.exp(-dist2 / (2 * sigma ** 2)) # (N, H, W)
291
+ density_map = gaussians.sum(dim=0, keepdim=True) # (1, H, W)
292
+
293
+ if normalize and N > 0:
294
+ density_map = density_map * (N / density_map.sum())
295
+
296
+ return density_map.unsqueeze(0)
297
+
298
+
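A quick check of generate_pseudo_density_map: with normalize=True the map should sum to the number of points (the three normalized coordinates below are arbitrary):

# Three arbitrary normalized (x, y) points rendered on a 64x64 grid.
import torch

pts = torch.tensor([[0.25, 0.25], [0.50, 0.50], [0.75, 0.10]])
dm = generate_pseudo_density_map(pts, output_size=(64, 64), sigma=4.0, normalize=True)
print(dm.shape, float(dm.sum()))   # torch.Size([1, 1, 64, 64]), ~3.0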
299
+ def show_density_map(density_map: torch.Tensor,
300
+ points_norm: torch.Tensor | None = None,
301
+ figsize: tuple[int, int] = (6, 8),
302
+ cmap: str = "jet") -> None:
303
+ dm = density_map.squeeze().detach().cpu().numpy() # (H, W)
304
+ H, W = dm.shape
305
+
306
+ plt.figure(figsize=figsize)
307
+ plt.imshow(dm, cmap=cmap, origin="upper")
308
+ plt.colorbar(label="Density")
309
+
310
+ if points_norm is not None and points_norm.numel() > 0:
311
+ pts = points_norm.detach().cpu().numpy()
312
+ xs = pts[:, 0] * (W - 1)
313
+ ys = pts[:, 1] * (H - 1)
314
+ plt.scatter(xs, ys, c="white", s=12, edgecolors="black", linewidths=0.5)
315
+
316
+ plt.title(f"Density map (sum = {dm.sum():.2f})")
317
+ plt.axis("off")
318
+ plt.tight_layout()
319
+ plt.show()
320
+
321
+
322
+ def show_image_with_density(pil_img: Image.Image,
323
+ density_map: torch.Tensor,
324
+ points_norm: torch.Tensor | None = None,
325
+ cmap: str = "jet",
326
+ alpha: float = 0.45,
327
+ figsize: tuple[int, int] = (6, 8)) -> None:
328
+ dm = density_map.squeeze().detach().cpu().numpy() # (H, W)
329
+ H, W = dm.shape
330
+
331
+ img_resized = pil_img.resize((W, H), Image.BILINEAR) # or LANCZOS
332
+ img_np = np.asarray(img_resized)
333
+
334
+ plt.figure(figsize=figsize)
335
+ plt.imshow(img_np, origin="upper")
336
+ plt.imshow(dm, cmap=cmap, alpha=alpha, origin="upper")
337
+
338
+ if points_norm is not None and points_norm.numel() > 0:
339
+ pts = points_norm.detach().cpu().numpy()
340
+ xs = pts[:, 0] * (W - 1)
341
+ ys = pts[:, 1] * (H - 1)
342
+ plt.scatter(xs, ys, c="white", s=12, edgecolors="black", linewidths=0.5)
343
+
344
+ plt.title(f"Overlay (density sum = {dm.sum():.2f})")
345
+ plt.axis("off")
346
+ plt.tight_layout()
347
+ plt.show()
348
+
349
+
350
+ def build_point_count_map(feat_maps: torch.Tensor,
351
+ pts_norm_list: list[torch.Tensor]) -> torch.Tensor:
352
+ assert feat_maps.dim() == 4, "expect NHWC: (B,H,W,D)"
353
+ B, H, W, _ = feat_maps.shape
354
+ device = feat_maps.device
355
+
356
+ count_map = torch.zeros((B, H, W), dtype=torch.float32, device=device)
357
+
358
+ for b in range(B):
359
+ pts = pts_norm_list[b].to(device).float() # (Ni, 2)
360
+ if pts.numel() == 0:
361
+ continue
362
+
363
+ idx_xy = (pts * torch.tensor([W, H], device=device)).long()
364
+ idx_xy[..., 0].clamp_(0, W - 1) # x
365
+ idx_xy[..., 1].clamp_(0, H - 1) # y
366
+
367
+ lin_idx = idx_xy[:, 1] * W + idx_xy[:, 0] # (Ni,)
368
+ one = torch.ones_like(lin_idx, dtype=torch.float32)
369
+
370
+ flat = torch.zeros(H * W, dtype=torch.float32, device=device)
371
+ flat.scatter_add_(0, lin_idx, one)
372
+ count_map[b] = flat.view(H, W)
373
+
374
+ return count_map
375
+
376
+
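A minimal check of build_point_count_map on dummy NHWC features; two of the three made-up points fall into the same cell, so that cell gets a count of 2 while the total still equals 3:

# Dummy (B, H, W, D) features and one list of normalized (x, y) points per image.
import torch

feats = torch.zeros(1, 8, 8, 16)
pts = [torch.tensor([[0.1, 0.1], [0.9, 0.9], [0.9, 0.9]])]
cmap = build_point_count_map(feats, pts)
print(cmap.shape, cmap.sum().item(), cmap.max().item())   # torch.Size([1, 8, 8]), 3.0, 2.0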
377
+ import torch
378
+ import torch.nn.functional as F
379
+
380
+
381
+ def extract_pos_tokens_single(feat_maps: torch.Tensor,
382
+ count_map: torch.Tensor):
383
+ assert feat_maps.dim() == 4 and count_map.dim() == 3, "expected shapes (B,H,W,D) / (B,H,W)"
384
+ B, H, W, D = feat_maps.shape
385
+ assert B == 1, "this function assumes batch_size == 1"
386
+ feat = feat_maps[0] # (H,W,D)
387
+ cnt = count_map[0] # (H,W)
388
+ pos_mask = cnt > 0 # Bool (H,W)
389
+ if pos_mask.sum() == 0:
390
+ empty = torch.empty(0, device=feat.device)
391
+ return empty.reshape(0, D), empty.long()
392
+ pos_tokens = feat[pos_mask] # (N_pos, D)
393
+ y_idx, x_idx = torch.nonzero(pos_mask, as_tuple=True)
394
+ lin_index = y_idx * W + x_idx # (N_pos,)
395
+ return pos_tokens, lin_index
396
+
397
+
398
+ def filter_overlap(pos_tok, lin_pos, neg_tok, lin_neg):
399
+ pos_only_mask = ~torch.isin(lin_pos, lin_neg)
400
+ neg_only_mask = ~torch.isin(lin_neg, lin_pos)
401
+ return pos_tok[pos_only_mask], neg_tok[neg_only_mask]
402
+
403
+
404
+ # ------------------------------------------------------------
405
+ # 2) supervised contrastive loss
406
+ # ------------------------------------------------------------
407
+ def supcon_pos_neg(pos_tokens, neg_tokens, temperature=0.07):
408
+ """
409
+ pos_tokens : (Np, D) Pos token
410
+ neg_tokens : (Nn, D) Neg token
411
+ """
412
+ if pos_tokens.numel() == 0 or neg_tokens.numel() == 0:
413
+ return torch.tensor(0., device=pos_tokens.device, requires_grad=True)
414
+ pos_tokens = F.normalize(pos_tokens, dim=-1)
415
+ neg_tokens = F.normalize(neg_tokens, dim=-1)
416
+ feats = torch.cat([pos_tokens, neg_tokens], dim=0) # (N, D)
417
+ labels = torch.cat([torch.zeros(len(pos_tokens), device=feats.device, dtype=torch.long),
418
+ torch.ones(len(neg_tokens), device=feats.device, dtype=torch.long)], dim=0) # (N,)
419
+ logits = feats @ feats.T / temperature # (N, N)
420
+ logits.fill_diagonal_(-1e4)
421
+ mask_pos = labels.unsqueeze(0) == labels.unsqueeze(1) # (N, N)
422
+ mask_pos.fill_diagonal_(False)
423
+ exp_logits = logits.exp()
424
+ denom = exp_logits.sum(dim=1, keepdim=True) # Σ_{a≠i} exp
425
+ log_prob = logits - denom.log() # log softmax
426
+ loss_i = -(mask_pos * log_prob).sum(1) / mask_pos.sum(1).clamp_min(1)
427
+ loss = loss_i.mean()
428
+ return loss
429
+
430
+
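Putting the helpers above together on synthetic data, a hedged sketch of how the token-level contrastive term might be computed; the shapes, point locations, and overlap are all invented:

# Synthetic single-image example: (1, H, W, D) features plus positive/negative point maps.
import torch

feat_maps = torch.randn(1, 8, 8, 32)
pos_pts = [torch.tensor([[0.2, 0.2], [0.8, 0.3], [0.4, 0.6]])]
neg_pts = [torch.tensor([[0.8, 0.3], [0.5, 0.9], [0.1, 0.8]])]  # first point overlaps a positive

pos_cnt = build_point_count_map(feat_maps, pos_pts)
neg_cnt = build_point_count_map(feat_maps, neg_pts)

pos_tok, lin_pos = extract_pos_tokens_single(feat_maps, pos_cnt)
neg_tok, lin_neg = extract_pos_tokens_single(feat_maps, neg_cnt)
pos_tok, neg_tok = filter_overlap(pos_tok, lin_pos, neg_tok, lin_neg)  # drop the shared cell

loss = supcon_pos_neg(pos_tok, neg_tok, temperature=0.07)
print(loss.item())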
431
+ def build_point_count_map(feat_maps: torch.Tensor,
432
+ pts_norm_list: list[torch.Tensor]) -> torch.Tensor:
433
+ assert feat_maps.dim() == 4, "expect NHWC: (B,H,W,D)"
434
+ B, H, W, _ = feat_maps.shape
435
+ device = feat_maps.device
436
+
437
+ count_map = torch.zeros((B, H, W), dtype=torch.float32, device=device)
438
+
439
+ for b in range(B):
440
+ pts = pts_norm_list[b].to(device).float() # (Ni, 2)
441
+ if pts.numel() == 0:
442
+ continue
443
+
444
+ idx_xy = (pts * torch.tensor([W, H], device=device)).long()
445
+ idx_xy[..., 0].clamp_(0, W - 1) # x
446
+ idx_xy[..., 1].clamp_(0, H - 1) # y
447
+
448
+ lin_idx = idx_xy[:, 1] * W + idx_xy[:, 0] # (Ni,)
449
+ one = torch.ones_like(lin_idx, dtype=torch.float32)
450
+
451
+ flat = torch.zeros(H * W, dtype=torch.float32, device=device)
452
+ flat.scatter_add_(0, lin_idx, one)
453
+ count_map[b] = flat.view(H, W)
454
+
455
+ return count_map