文件下载:

我首先下载了KaihuaTang/Scene-Graph-Benchmark.pytorch/Publicdatasets/vg网址下的这两个文件:

VG-SGG-dicts-with-attri.json
image_data.json

然后参考Scene-Graph-Benchmark.pytorch/DATASET.md,下载了数据文件VG-SGG-with-attri.h5

之后我把这些文件放到了同一个文件夹下。

预处理代码1(展示其前三条数据):

展示其前三条图片数据的代码:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import json
import sys
from typing import Any, Dict, List

# h5py is mandatory (it reads the VG-SGG HDF5 file); fail fast with an
# install hint instead of crashing later with a NameError.
try:
    import h5py
except ImportError:
    print("缺少依赖:h5py。请先安装:pip install h5py")
    sys.exit(1)


def load_json(path: str) -> Any:
    """Read the UTF-8 encoded file at *path* and return the parsed JSON object."""
    with open(path, "r", encoding="utf-8") as fp:
        return json.loads(fp.read())


def coerce_int_key_map(m: Dict[Any, Any]) -> Dict[int, str]:
    """Convert a JSON-loaded mapping into ``{int_key: str_value}``.

    JSON object keys are always strings, so index->name dictionaries
    (e.g. ``idx_to_label``) arrive with keys like ``"136"``. Entries whose
    key cannot be parsed as an integer are silently dropped.
    """
    res: Dict[int, str] = {}
    for k, v in m.items():
        try:
            kk = int(k)
        except (ValueError, TypeError):
            # Narrowed from a bare ``except Exception``: only conversion
            # failures should be skipped (matches the v2 script's version).
            continue
        res[kk] = str(v)
    return res


def preview_image_full(
    f: h5py.File,
    image_data: List[Dict[str, Any]],
    idx2label_maps: Dict[str, Dict[int, str]],
    img_index: int,
    limit: int,
) -> Dict[str, Any]:
    """Build a dict holding the full groundtruth of one image.

    Parameters:
        f: open VG-SGG HDF5 file (box/label/attribute/relation arrays).
        image_data: per-image metadata list loaded from ``image_data.json``.
        idx2label_maps: int->name dictionaries ("idx_to_label",
            "idx_to_predicate", "idx_to_attribute").
        img_index: position of the image in the HDF5 arrays.
        limit: max number of boxes / triplets included in the output.
    """

    # Basic metadata (may be absent if the JSON list is shorter than the HDF5).
    img_info = image_data[img_index] if img_index < len(image_data) else None
    img_id = img_info.get("image_id") if img_info else None

    # Box range: [fb, lb] are inclusive GLOBAL indices into the box arrays.
    fb = int(f["img_to_first_box"][img_index])
    lb = int(f["img_to_last_box"][img_index])
    boxes = f["boxes_512"][fb : lb + 1].tolist() if "boxes_512" in f else []
    labels = f["labels"][fb : lb + 1].flatten().tolist()
    idx2obj = idx2label_maps.get("idx_to_label", {})
    decoded_labels = [idx2obj.get(int(x), f"<unk:{x}>") for x in labels]

    # Attributes: each row holds fixed-size index slots; values <= 0 are
    # empty slots and are filtered out before decoding.
    attributes_rows = f["attributes"][fb : lb + 1].tolist() if "attributes" in f else []
    idx2attr = idx2label_maps.get("idx_to_attribute", {})
    decoded_attrs_rows = []
    for row in attributes_rows:
        decoded = [idx2attr.get(int(x), f"<unk:{x}>") for x in row if isinstance(x, int) and x > 0]
        decoded_attrs_rows.append(decoded)

    # Assemble per-box records (coordinates + decoded label + attributes).
    all_boxes = []
    num_with_attr = 0
    for i in range(len(labels)):
        attrs = decoded_attrs_rows[i] if i < len(decoded_attrs_rows) else []
        if attrs:
            num_with_attr += 1
        all_boxes.append({
            "box": boxes[i] if i < len(boxes) else None,
            "label_idx": labels[i],
            "label": decoded_labels[i],
            "attributes": attrs
        })

    # Relation range, analogous to the box range above.
    fr = int(f["img_to_first_rel"][img_index])
    lr = int(f["img_to_last_rel"][img_index])
    rel_pairs = f["relationships"][fr : lr + 1].tolist() if "relationships" in f else []
    rel_labels = f["predicates"][fr : lr + 1].flatten().tolist() if "predicates" in f else []
    idx2pred = idx2label_maps.get("idx_to_predicate", {})
    decoded_rels = [idx2pred.get(int(x), f"<unk:{x}>") for x in rel_labels]

    # Decode relation triplets. Subject/object in ``relationships`` are
    # GLOBAL box indices, so subtract fb to index into decoded_labels.
    triplets = []
    for i in range(len(rel_pairs)):
        subj_global, obj_global = rel_pairs[i]
        subj_local = subj_global - fb
        obj_local = obj_global - fb
        subj_name = decoded_labels[subj_local] if 0 <= subj_local < len(decoded_labels) else f"<unk:{subj_global}>"
        obj_name = decoded_labels[obj_local] if 0 <= obj_local < len(decoded_labels) else f"<unk:{obj_global}>"
        pred_name = decoded_rels[i]
        triplets.append([subj_name, pred_name, obj_name])

    # De-duplicate while preserving first-seen order.
    triplets_unique = []
    seen = set()
    for t in triplets:
        key = tuple(t)
        if key not in seen:
            seen.add(key)
            triplets_unique.append(t)

    out: Dict[str, Any] = {
        "image_index": img_index,
        "image_id": img_id,
        "image_info": {
            "width": img_info.get("width") if img_info else None,
            "height": img_info.get("height") if img_info else None,
            "url": img_info.get("url") if img_info else None,
        },
        "num_boxes": len(labels),
        "num_boxes_with_attr": num_with_attr,
        "all_boxes": all_boxes[:limit],  # only the first `limit` boxes
        "num_relations": len(rel_labels),
        "triplets_raw": triplets[:limit],  # raw triplets (may contain duplicates)
        "triplets_unique": triplets_unique[:limit],  # de-duplicated triplets
    }

    return out


def main():
    """CLI entry point: pretty-print the groundtruth of the first N images."""
    from pprint import pprint

    ap = argparse.ArgumentParser(description="VG150 groundtruth 结构化输出(增强版)")
    ap.add_argument("--image_data", type=str, default="image_data.json")
    ap.add_argument("--dicts", type=str, default="VG-SGG-dicts-with-attri.json")
    ap.add_argument("--h5", type=str, default="VG-SGG-with-attri.h5")
    ap.add_argument("--num_images", type=int, default=3)
    ap.add_argument("--limit", type=int, default=5)
    opts = ap.parse_args()

    image_data = load_json(opts.image_data)
    dicts_data = load_json(opts.dicts)

    # Only the dictionaries that are actually present in the JSON file.
    wanted = ("idx_to_label", "idx_to_predicate", "idx_to_attribute")
    idx2label_maps: Dict[str, Dict[int, str]] = {
        k: coerce_int_key_map(dicts_data[k]) for k in wanted if k in dicts_data
    }

    with h5py.File(opts.h5, "r") as f:
        for i in range(opts.num_images):
            out = preview_image_full(f, image_data, idx2label_maps, img_index=i, limit=opts.limit)
            print("\n==============================")
            print(f"## 图像索引 {i} 的完整 groundtruth")
            print("==============================")
            pprint(out, width=120)


if __name__ == "__main__":
    main()

首先运行了一下展示其前三条数据的代码,得到的结果如下:

==============================
## 图像索引 0 的完整 groundtruth
==============================
{'all_boxes': [{'attributes': [], 'box': [256, 178, 512, 357], 'label': 'tree', 'label_idx': 136},
               {'attributes': ['brick'], 'box': [280, 289, 463, 186], 'label': 'sidewalk', 'label_idx': 114},
               {'attributes': ['brick', 'tall'], 'box': [71, 172, 143, 345], 'label': 'building', 'label_idx': 22},
               {'attributes': ['clean'], 'box': [395, 263, 230, 166], 'label': 'street', 'label_idx': 124},
               {'attributes': ['green', 'tall'], 'box': [294, 155, 50, 233], 'label': 'clock', 'label_idx': 30}],
 'image_id': 1,
 'image_index': 0,
 'image_info': {'height': 600, 'url': 'https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg', 'width': 800},
 'num_boxes': 23,
 'num_boxes_with_attr': 15,
 'num_relations': 29,
 'triplets_raw': [['man', 'wears', 'sneaker'],
                  ['sign', 'on', 'building'],
                  ['man', 'has', 'shirt'],
                  ['sidewalk', 'near', 'street'],
                  ['man', 'has', 'glass']],
 'triplets_unique': [['man', 'wears', 'sneaker'],
                     ['sign', 'on', 'building'],
                     ['man', 'has', 'shirt'],
                     ['sidewalk', 'near', 'street'],
                     ['man', 'has', 'glass']]}

==============================
## 图像索引 1 的完整 groundtruth
==============================
{'all_boxes': [{'attributes': ['brick', 'white'], 'box': [357, 302, 306, 162], 'label': 'sidewalk', 'label_idx': 114},
               {'attributes': ['orange', 'brown', 'tall'],
                'box': [436, 132, 146, 265],
                'label': 'building',
                'label_idx': 22},
               {'attributes': ['red', 'brown'], 'box': [191, 102, 166, 205], 'label': 'building', 'label_idx': 22},
               {'attributes': ['white', 'walking'], 'box': [249, 286, 91, 158], 'label': 'man', 'label_idx': 78},
               {'attributes': [], 'box': [286, 171, 32, 342], 'label': 'pole', 'label_idx': 99}],
 'image_id': 2,
 'image_index': 1,
 'image_info': {'height': 600, 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/2.jpg', 'width': 800},
 'num_boxes': 16,
 'num_boxes_with_attr': 13,
 'num_relations': 6,
 'triplets_raw': [['building', 'has', 'window'],
                  ['building', 'has', 'window'],
                  ['building', 'has', 'window'],
                  ['building', 'has', 'window'],
                  ['building', 'has', 'window']],
 'triplets_unique': [['building', 'has', 'window'], ['bike', 'near', 'car']]}

==============================
## 图像索引 2 的完整 groundtruth
==============================
{'all_boxes': [{'attributes': ['white', 'curved'], 'box': [306, 282, 406, 196], 'label': 'table', 'label_idx': 126},
               {'attributes': ['sitting'], 'box': [464, 130, 91, 111], 'label': 'girl', 'label_idx': 53},
               {'attributes': ['black', 'leather'], 'box': [484, 270, 53, 104], 'label': 'bag', 'label_idx': 4},
               {'attributes': ['long'], 'box': [488, 121, 45, 88], 'label': 'hair', 'label_idx': 57},
               {'attributes': [], 'box': [53, 355, 104, 35], 'label': 'drawer', 'label_idx': 39}],
 'image_id': 3,
 'image_index': 2,
 'image_info': {'height': 480, 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/3.jpg', 'width': 640},
 'num_boxes': 8,
 'num_boxes_with_attr': 4,
 'num_relations': 6,
 'triplets_raw': [['girl', 'has', 'hair'],
                  ['bag', 'on', 'table'],
                  ['girl', 'has', 'hair'],
                  ['girl', 'with', 'hair'],
                  ['girl', 'with', 'hair']],
 'triplets_unique': [['girl', 'has', 'hair'],
                     ['bag', 'on', 'table'],
                     ['girl', 'with', 'hair'],
                     ['drawer', 'has', 'handle']]}

这三条数据让我们可以初步了解这个数据的情况。

预处理代码2 展示其全部数据:


#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
VG150 groundtruth 结构化导出工具 (JSON Lines 版本 v2)
- 遍历数据集中所有图片
- 将每张图片的信息保存为单行 JSON 对象,输出为 .jsonl 文件
- 优化数据结构,使其更紧凑、易于审阅
- 新增:统计并报告缺少 URL 的图像信息
"""

import argparse
import json
import sys
from typing import Any, Dict, List
from urllib.parse import urlparse
import os

# h5py is mandatory (reads the VG-SGG HDF5 file); abort with an install hint.
try:
    import h5py
except ImportError:
    print("错误:缺少依赖 h5py。请先安装:pip install h5py")
    sys.exit(1)

# tqdm is optional (progress bar only); fall back to a no-op pass-through
# that accepts and ignores tqdm-style keyword arguments such as ``desc``.
try:
    from tqdm import tqdm
except ImportError:
    print("提示:缺少可选依赖 tqdm。建议安装以显示进度条:pip install tqdm")
    def tqdm(iterable, **kwargs):
        return iterable


def load_json(path: str) -> Any:
    """Deserialize and return the JSON content of the UTF-8 file *path*."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.loads(handle.read())


def coerce_int_key_map(m: Dict[Any, Any]) -> Dict[int, str]:
    """Return a copy of *m* with integer keys and string values.

    Entries whose key cannot be converted with ``int()`` are dropped.
    """
    converted: Dict[int, str] = {}
    for raw_key, raw_value in m.items():
        try:
            numeric_key = int(raw_key)
        except (ValueError, TypeError):
            continue
        converted[numeric_key] = str(raw_value)
    return converted


def process_image_groundtruth(
    f: "h5py.File",  # quoted so the module imports even without h5py installed
    image_data: List[Dict[str, Any]],
    idx2label_maps: Dict[str, Dict[int, str]],
    img_index: int,
) -> Dict[str, Any]:
    """Build the full groundtruth dict for one image.

    Parameters:
        f: open VG-SGG HDF5 file (box/label/attribute/relation arrays).
        image_data: per-image metadata list loaded from ``image_data.json``.
        idx2label_maps: int->name dictionaries ("idx_to_label",
            "idx_to_predicate", "idx_to_attribute").
        img_index: position of the image in the HDF5 arrays.

    Returns:
        ``{"image_info": ..., "objects": ..., "groundtruth_triplets": ...}``
        where triplets are de-duplicated in first-seen order.
    """

    img_info = image_data[img_index] if img_index < len(image_data) else {}

    # Box range: [fb, lb] are inclusive GLOBAL indices into the box arrays.
    fb = int(f["img_to_first_box"][img_index])
    lb = int(f["img_to_last_box"][img_index])

    boxes = f["boxes_512"][fb : lb + 1].tolist()
    labels = f["labels"][fb : lb + 1].flatten().tolist()
    idx2obj = idx2label_maps.get("idx_to_label", {})
    decoded_labels = [idx2obj.get(x, f"<unk:{x}>") for x in labels]

    # Attribute rows hold fixed-size index slots; values <= 0 are empty.
    attributes_rows = f["attributes"][fb : lb + 1].tolist()
    idx2attr = idx2label_maps.get("idx_to_attribute", {})
    decoded_attrs_rows = [
        [idx2attr.get(x, f"<unk:{x}>") for x in row if x > 0]
        for row in attributes_rows
    ]

    all_boxes = [
        {"label": decoded_labels[i], "attributes": decoded_attrs_rows[i], "box": boxes[i]}
        for i in range(len(labels))
    ]

    # Relation range, analogous to the box range above.
    fr = int(f["img_to_first_rel"][img_index])
    lr = int(f["img_to_last_rel"][img_index])
    rel_pairs = f["relationships"][fr : lr + 1].tolist()
    rel_labels = f["predicates"][fr : lr + 1].flatten().tolist()
    idx2pred = idx2label_maps.get("idx_to_predicate", {})
    decoded_rels = [idx2pred.get(x, f"<unk:{x}>") for x in rel_labels]

    triplets = []
    for i in range(len(rel_pairs)):
        # Relationship endpoints are GLOBAL box indices; convert to local.
        subj_global, obj_global = rel_pairs[i]
        subj_local = subj_global - fb
        obj_local = obj_global - fb

        subj_name = decoded_labels[subj_local] if 0 <= subj_local < len(decoded_labels) else "<out_of_bounds>"
        obj_name = decoded_labels[obj_local] if 0 <= obj_local < len(decoded_labels) else "<out_of_bounds>"

        triplets.append([subj_name, decoded_rels[i], obj_name])

    # BUGFIX: the previous ``set``-based de-duplication produced a
    # nondeterministic triplet order (string hashing is randomized per run)
    # and discarded first-seen order, unlike the preview script.
    # ``dict.fromkeys`` de-duplicates deterministically, keeping order.
    triplets_unique = [list(t) for t in dict.fromkeys(tuple(t) for t in triplets)]

    return {
        "image_info": img_info,
        "objects": all_boxes,
        "groundtruth_triplets": triplets_unique,
    }

def main():
    """CLI entry point: export every image's groundtruth to a JSONL file
    and report images whose metadata lacks a URL."""
    parser = argparse.ArgumentParser(description="VG150 groundtruth 导出为 JSON Lines 格式 (带统计功能)")
    parser.add_argument("--image_data", type=str, default="image_data.json", help="图像元数据 JSON 文件路径")
    parser.add_argument("--dicts", type=str, default="VG-SGG-dicts-with-attri.json", help="字典 JSON 文件路径")
    parser.add_argument("--h5", type=str, default="VG-SGG-with-attri.h5", help="数据集 HDF5 文件路径")
    parser.add_argument("--output_file", type=str, default="vg150_groundtruth.jsonl", help="输出的 JSON Lines 文件路径")
    args = parser.parse_args()

    print("正在加载元数据...")
    image_data = load_json(args.image_data)
    dicts_data = load_json(args.dicts)

    # Keep only the index->name dictionaries present in the JSON file.
    idx2label_maps: Dict[str, Dict[int, str]] = {
        key: coerce_int_key_map(dicts_data[key])
        for key in ["idx_to_label", "idx_to_predicate", "idx_to_attribute"]
        if key in dicts_data
    }

    # Counters for images whose metadata lacks a URL (reported at the end).
    missing_url_count = 0
    images_with_missing_urls = []

    print(f"正在打开 HDF5 文件: {args.h5}")
    with h5py.File(args.h5, "r") as f, open(args.output_file, "w", encoding="utf-8") as out_f:
        num_images = f["img_to_first_box"].shape[0]
        print(f"发现 {num_images} 张图像,开始处理并写入到 {args.output_file}...")

        for i in tqdm(range(num_images), desc="处理图像中"):
            gt_data = process_image_groundtruth(f, image_data, idx2label_maps, img_index=i)

            url = gt_data["image_info"].get("url")  # .get() tolerates a missing key

            # Track images without a URL for the final report.
            if not url:
                missing_url_count += 1
                images_with_missing_urls.append({
                    "index": i,
                    "image_id": gt_data["image_info"].get("image_id")
                })

            # Derive the output file name from the URL's last path segment,
            # falling back to "<image_id>.jpg" (or "<index>.jpg") without one.
            image_filename = os.path.basename(urlparse(url).path) if url else f"{gt_data['image_info'].get('image_id', i)}.jpg"

            final_line_object = {
                "image_id": image_filename,
                "groundtruth_triplets": gt_data["groundtruth_triplets"],
                "objects": gt_data["objects"],
                "image_info": {
                    "width": gt_data["image_info"].get("width"),
                    "height": gt_data["image_info"].get("height"),
                    "url": url,
                }
            }

            out_f.write(json.dumps(final_line_object, ensure_ascii=False) + "\n")

    print(f"\n处理完成!数据已成功保存到: {args.output_file}")

    # Final data-completeness report (first 10 offenders at most).
    print("\n--- 数据完整性统计报告 ---")
    if missing_url_count == 0:
        print("✅ 所有图像都包含有效的URL,数据完整性良好!")
    else:
        print(f"⚠️ 发现 {missing_url_count} 张图像缺少URL信息。")
        print("   对于这些图像,已使用其数字 image_id 生成了备用文件名 (例如 '123.jpg')。")
        print(f"   以下是前 {min(10, missing_url_count)} 个缺少URL的图像详情:")
        for item in images_with_missing_urls[:10]:
            print(f"     - 图像索引: {item['index']}, 原始数字ID: {item['image_id']}")
    print("--------------------------")


if __name__ == "__main__":
    main()

代码2预处理了全部数据,并且把这些数据保存到了vg150_groundtruth.jsonl

正在加载元数据...
正在打开 HDF5 文件: VG-SGG-with-attri.h5
发现 108073 张图像,开始处理并写入到 vg150_groundtruth.jsonl...
处理图像中: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108073/108073 [02:35<00:00, 695.98it/s] 

处理完成!数据已成功保存到: vg150_groundtruth.jsonl

--- 数据完整性统计报告 ---
✅ 所有图像都包含有效的URL,数据完整性良好!

生成的vg150_groundtruth.jsonl文件内容的前两条如下:

{"image_id": "1.jpg", "groundtruth_triplets": [["sign", "on", "building"], ["man", "has", "shirt"], ["man", "wearing", "glass"], ["bike", "behind", "man"], ["bike", "on", "sidewalk"], ["man", "has", "shoe"], ["man", "wears", "sneaker"], ["man", "in", "shirt"], ["sidewalk", "near", "street"], ["man", "wears", "pant"], ["tree", "near", "street"], ["tree", "near", "sidewalk"], ["man", "wearing", "pant"], ["man", "has", "glass"], ["man", "wearing", "shirt"], ["bike", "near", "tree"], ["man", "wearing", "shoe"], ["building", "with", "window"], ["shirt", "on", "man"], ["bike", "parked on", "sidewalk"], ["car", "parked on", "street"], ["man", "has", "pant"]], "objects": [{"label": "tree", "attributes": [], "box": [256, 178, 512, 357]}, {"label": "sidewalk", "attributes": ["brick"], "box": [280, 289, 463, 186]}, {"label": "building", "attributes": ["brick", "tall"], "box": [71, 172, 143, 345]}, {"label": "street", "attributes": ["clean"], "box": [395, 263, 230, 166]}, {"label": "clock", "attributes": ["green", "tall"], "box": [294, 155, 50, 233]}, {"label": "window", "attributes": [], "box": [447, 47, 127, 95]}, {"label": "man", "attributes": [], "box": [260, 248, 49, 160]}, {"label": "man", "attributes": [], "box": [170, 242, 38, 168]}, {"label": "sign", "attributes": ["black"], "box": [103, 64, 50, 115]}, {"label": "car", "attributes": ["white", "parked"], "box": [485, 270, 52, 105]}, {"label": "shirt", "attributes": ["grey"], "box": [260, 220, 53, 64]}, {"label": "car", "attributes": [], "box": [330, 233, 50, 61]}, {"label": "pant", "attributes": ["grey"], "box": [262, 276, 31, 82]}, {"label": "shirt", "attributes": ["orange", "red"], "box": [170, 216, 35, 66]}, {"label": "pant", "attributes": ["black"], "box": [170, 283, 29, 76]}, {"label": "shoe", "attributes": ["brown"], "box": [262, 318, 31, 18]}, {"label": "arm", "attributes": ["raised"], "box": [246, 194, 20, 27]}, {"label": "bike", "attributes": ["parked"], "box": [224, 215, 18, 24]}, {"label": "bike", 
"attributes": [], "box": [213, 211, 18, 27]}, {"label": "glass", "attributes": [], "box": [300, 209, 28, 15]}, {"label": "street", "attributes": ["brick"], "box": [276, 293, 457, 168]}, {"label": "sneaker", "attributes": ["grey"], "box": [171, 320, 34, 17]}, {"label": "bike", "attributes": [], "box": [217, 214, 26, 23]}], "image_info": {"width": 800, "height": 600, "url": "https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg"}}
{"image_id": "2.jpg", "groundtruth_triplets": [["building", "has", "window"], ["bike", "near", "car"]], "objects": [{"label": "sidewalk", "attributes": ["brick", "white"], "box": [357, 302, 306, 162]}, {"label": "building", "attributes": ["orange", "brown", "tall"], "box": [436, 132, 146, 265]}, {"label": "building", "attributes": ["red", "brown"], "box": [191, 102, 166, 205]}, {"label": "man", "attributes": ["white", "walking"], "box": [249, 286, 91, 158]}, {"label": "pole", "attributes": [], "box": [286, 171, 32, 342]}, {"label": "window", "attributes": ["glass"], "box": [444, 95, 61, 106]}, {"label": "car", "attributes": ["parked", "red"], "box": [193, 260, 82, 70]}, {"label": "tree", "attributes": ["green"], "box": [28, 162, 57, 141]}, {"label": "tree", "attributes": ["green"], "box": [98, 176, 68, 104]}, {"label": "tree", "attributes": ["green"], "box": [56, 164, 40, 132]}, {"label": "window", "attributes": ["glass"], "box": [442, 202, 63, 71]}, {"label": "window", "attributes": ["glass"], "box": [493, 96, 34, 104]}, {"label": "car", "attributes": ["white"], "box": [249, 235, 52, 40]}, {"label": "window", "attributes": ["glass"], "box": [495, 202, 32, 69]}, {"label": "window", "attributes": [], "box": [409, 100, 12, 109]}, {"label": "bike", "attributes": [], "box": [281, 278, 31, 41]}], "image_info": {"width": 800, "height": 600, "url": "https://cs.stanford.edu/people/rak248/VG_100K/2.jpg"}}

其中,"image_id" 字段的值(例如 "1.jpg")是从该条数据的 "url" 字段(例如 "https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg")中提取文件名得到的。

本文档涉及到的所有文件的百度网盘分享:

通过网盘分享的文件:image_data.json等4个文件
链接: https://pan.baidu.com/s/1LzAUWZDeXTi_Xmk6hqvQRg 提取码: u3g5

Logo

魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。

更多推荐