scene graph generation 用到的vg150数据集groundtruth数据预处理,展示和保存
链接: https://pan.baidu.com/s/1cq5ftXarXwQ7B2yKf_rGZg 提取码: wvss。通过网盘分享的文件:image_data.json等4个文件。代码2预处理了全部数据,并且把这些数据保存到了vg150_groundtruth.jsonl里。前三条数据让我们可以初步了解这个数据的情况。之后我把这些文件放到了同一个文件夹下。
scene graph generation 用到的vg150数据集groundtruth数据预处理,展示和保存
文件下载:
我首先下载了KaihuaTang/Scene-Graph-Benchmark.pytorch/Publicdatasets/vg网址下的这两个文件:
VG-SGG-dicts-with-attri.json
image_data.json
然后参考Scene-Graph-Benchmark.pytorch/DATASET.md,下载了数据文件VG-SGG-with-attri.h5 。
之后我把这些文件放到了同一个文件夹下。
预处理代码1(展示其前三条数据):
展示其前三条图片数据的代码:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import json
import sys
from typing import Any, Dict, List
# Fail fast with a pip install hint when the required HDF5 reader is missing.
try:
    import h5py
except ImportError:
    print("缺少依赖:h5py。请先安装:pip install h5py")
    sys.exit(1)
def load_json(path: str) -> Any:
    """Read *path* as UTF-8 text and return the parsed JSON value."""
    with open(path, "r", encoding="utf-8") as fp:
        raw = fp.read()
    return json.loads(raw)
def coerce_int_key_map(m: Dict[Any, Any]) -> Dict[int, str]:
    """Convert JSON string keys (e.g. "15") to int keys, stringifying values.

    Keys that cannot be parsed as integers are skipped silently; VG dict
    files use numeric-string keys, so nothing should actually be dropped.
    """
    res: Dict[int, str] = {}
    for k, v in m.items():
        try:
            kk = int(k)
        except (ValueError, TypeError):
            # Narrow exceptions only (consistent with the exporter script):
            # a bare Exception would also hide unrelated programming errors.
            continue
        res[kk] = str(v)
    return res
def preview_image_full(
    f: Any,  # h5py.File, or any mapping exposing the same array datasets
    image_data: List[Dict[str, Any]],
    idx2label_maps: Dict[str, Dict[int, str]],
    img_index: int,
    limit: int,
) -> Dict[str, Any]:
    """Build a dict with the complete groundtruth of one image.

    Parameters:
        f: opened VG-SGG-with-attri.h5 file (only mapping-style access is
           used, so any dict of numpy arrays with the same keys works too).
        image_data: parsed image_data.json (per-image metadata).
        idx2label_maps: int->name maps for labels / predicates / attributes.
        img_index: index of the image inside the h5 arrays.
        limit: max number of boxes / triplets copied into the output.

    Returns a dict with image metadata, decoded boxes (with attributes),
    raw relation triplets and an order-preserving de-duplicated triplet list.
    """
    # Per-image metadata; may be absent if the JSON is shorter than the h5.
    img_info = image_data[img_index] if img_index < len(image_data) else None
    img_id = img_info.get("image_id") if img_info else None

    # Box index span for this image. VG marks images that have no boxes
    # with -1/-1, so guard explicitly instead of relying on negative slices.
    fb = int(f["img_to_first_box"][img_index])
    lb = int(f["img_to_last_box"][img_index])
    if fb < 0:
        boxes, labels, attributes_rows = [], [], []
    else:
        boxes = f["boxes_512"][fb : lb + 1].tolist() if "boxes_512" in f else []
        labels = f["labels"][fb : lb + 1].flatten().tolist()
        attributes_rows = f["attributes"][fb : lb + 1].tolist() if "attributes" in f else []

    idx2obj = idx2label_maps.get("idx_to_label", {})
    decoded_labels = [idx2obj.get(int(x), f"<unk:{x}>") for x in labels]

    # Decode attributes; id 0 is padding, so only positive ids are kept.
    idx2attr = idx2label_maps.get("idx_to_attribute", {})
    decoded_attrs_rows = []
    for row in attributes_rows:
        decoded = [idx2attr.get(int(x), f"<unk:{x}>") for x in row if int(x) > 0]
        decoded_attrs_rows.append(decoded)

    # Assemble per-box records and count boxes carrying at least one attribute.
    all_boxes = []
    num_with_attr = 0
    for i in range(len(labels)):
        attrs = decoded_attrs_rows[i] if i < len(decoded_attrs_rows) else []
        if attrs:
            num_with_attr += 1
        all_boxes.append({
            "box": boxes[i] if i < len(boxes) else None,
            "label_idx": labels[i],
            "label": decoded_labels[i],
            "attributes": attrs,
        })

    # Relation index span; -1/-1 means the image has no relations.
    fr = int(f["img_to_first_rel"][img_index])
    lr = int(f["img_to_last_rel"][img_index])
    if fr < 0:
        rel_pairs, rel_labels = [], []
    else:
        rel_pairs = f["relationships"][fr : lr + 1].tolist() if "relationships" in f else []
        rel_labels = f["predicates"][fr : lr + 1].flatten().tolist() if "predicates" in f else []
    idx2pred = idx2label_maps.get("idx_to_predicate", {})
    decoded_rels = [idx2pred.get(int(x), f"<unk:{x}>") for x in rel_labels]

    # Relationship endpoints are *global* box indices -> shift to local ones.
    triplets = []
    for i in range(len(rel_pairs)):
        subj_global, obj_global = rel_pairs[i]
        subj_local = subj_global - fb
        obj_local = obj_global - fb
        subj_name = decoded_labels[subj_local] if 0 <= subj_local < len(decoded_labels) else f"<unk:{subj_global}>"
        obj_name = decoded_labels[obj_local] if 0 <= obj_local < len(decoded_labels) else f"<unk:{obj_global}>"
        triplets.append([subj_name, decoded_rels[i], obj_name])

    # De-duplicate triplets while preserving first-seen order.
    triplets_unique = []
    seen = set()
    for t in triplets:
        key = tuple(t)
        if key not in seen:
            seen.add(key)
            triplets_unique.append(t)

    return {
        "image_index": img_index,
        "image_id": img_id,
        "image_info": {
            "width": img_info.get("width") if img_info else None,
            "height": img_info.get("height") if img_info else None,
            "url": img_info.get("url") if img_info else None,
        },
        "num_boxes": len(labels),
        "num_boxes_with_attr": num_with_attr,
        "all_boxes": all_boxes[:limit],  # only the first `limit` boxes
        "num_relations": len(rel_labels),
        "triplets_raw": triplets[:limit],            # raw triplets (may repeat)
        "triplets_unique": triplets_unique[:limit],  # de-duplicated triplets
    }
def main():
    """CLI entry point: pretty-print the groundtruth of the first N images."""
    arg_parser = argparse.ArgumentParser(description="VG150 groundtruth 结构化输出(增强版)")
    arg_parser.add_argument("--image_data", type=str, default="image_data.json")
    arg_parser.add_argument("--dicts", type=str, default="VG-SGG-dicts-with-attri.json")
    arg_parser.add_argument("--h5", type=str, default="VG-SGG-with-attri.h5")
    arg_parser.add_argument("--num_images", type=int, default=3)
    arg_parser.add_argument("--limit", type=int, default=5)
    opts = arg_parser.parse_args()

    meta = load_json(opts.image_data)
    dict_data = load_json(opts.dicts)
    # Keep only the name maps that actually exist in the dicts file.
    name_maps: Dict[str, Dict[int, str]] = {
        k: coerce_int_key_map(dict_data[k])
        for k in ("idx_to_label", "idx_to_predicate", "idx_to_attribute")
        if k in dict_data
    }

    with h5py.File(opts.h5, "r") as h5_file:
        from pprint import pprint
        for idx in range(opts.num_images):
            record = preview_image_full(h5_file, meta, name_maps, img_index=idx, limit=opts.limit)
            print("\n==============================")
            print(f"## 图像索引 {idx} 的完整 groundtruth")
            print("==============================")
            pprint(record, width=120)


if __name__ == "__main__":
    main()
首先运行了一下展示其前三条数据的代码,得到的结果如下:
==============================
## 图像索引 0 的完整 groundtruth
==============================
{'all_boxes': [{'attributes': [], 'box': [256, 178, 512, 357], 'label': 'tree', 'label_idx': 136},
{'attributes': ['brick'], 'box': [280, 289, 463, 186], 'label': 'sidewalk', 'label_idx': 114},
{'attributes': ['brick', 'tall'], 'box': [71, 172, 143, 345], 'label': 'building', 'label_idx': 22},
{'attributes': ['clean'], 'box': [395, 263, 230, 166], 'label': 'street', 'label_idx': 124},
{'attributes': ['green', 'tall'], 'box': [294, 155, 50, 233], 'label': 'clock', 'label_idx': 30}],
'image_id': 1,
'image_index': 0,
'image_info': {'height': 600, 'url': 'https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg', 'width': 800},
'num_boxes': 23,
'num_boxes_with_attr': 15,
'num_relations': 29,
'triplets_raw': [['man', 'wears', 'sneaker'],
['sign', 'on', 'building'],
['man', 'has', 'shirt'],
['sidewalk', 'near', 'street'],
['man', 'has', 'glass']],
'triplets_unique': [['man', 'wears', 'sneaker'],
['sign', 'on', 'building'],
['man', 'has', 'shirt'],
['sidewalk', 'near', 'street'],
['man', 'has', 'glass']]}
==============================
## 图像索引 1 的完整 groundtruth
==============================
{'all_boxes': [{'attributes': ['brick', 'white'], 'box': [357, 302, 306, 162], 'label': 'sidewalk', 'label_idx': 114},
{'attributes': ['orange', 'brown', 'tall'],
'box': [436, 132, 146, 265],
'label': 'building',
'label_idx': 22},
{'attributes': ['red', 'brown'], 'box': [191, 102, 166, 205], 'label': 'building', 'label_idx': 22},
{'attributes': ['white', 'walking'], 'box': [249, 286, 91, 158], 'label': 'man', 'label_idx': 78},
{'attributes': [], 'box': [286, 171, 32, 342], 'label': 'pole', 'label_idx': 99}],
'image_id': 2,
'image_index': 1,
'image_info': {'height': 600, 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/2.jpg', 'width': 800},
'num_boxes': 16,
'num_boxes_with_attr': 13,
'num_relations': 6,
'triplets_raw': [['building', 'has', 'window'],
['building', 'has', 'window'],
['building', 'has', 'window'],
['building', 'has', 'window'],
['building', 'has', 'window']],
'triplets_unique': [['building', 'has', 'window'], ['bike', 'near', 'car']]}
==============================
## 图像索引 2 的完整 groundtruth
==============================
{'all_boxes': [{'attributes': ['white', 'curved'], 'box': [306, 282, 406, 196], 'label': 'table', 'label_idx': 126},
{'attributes': ['sitting'], 'box': [464, 130, 91, 111], 'label': 'girl', 'label_idx': 53},
{'attributes': ['black', 'leather'], 'box': [484, 270, 53, 104], 'label': 'bag', 'label_idx': 4},
{'attributes': ['long'], 'box': [488, 121, 45, 88], 'label': 'hair', 'label_idx': 57},
{'attributes': [], 'box': [53, 355, 104, 35], 'label': 'drawer', 'label_idx': 39}],
'image_id': 3,
'image_index': 2,
'image_info': {'height': 480, 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/3.jpg', 'width': 640},
'num_boxes': 8,
'num_boxes_with_attr': 4,
'num_relations': 6,
'triplets_raw': [['girl', 'has', 'hair'],
['bag', 'on', 'table'],
['girl', 'has', 'hair'],
['girl', 'with', 'hair'],
['girl', 'with', 'hair']],
'triplets_unique': [['girl', 'has', 'hair'],
['bag', 'on', 'table'],
['girl', 'with', 'hair'],
['drawer', 'has', 'handle']]}
这三条数据让我们可以初步了解这个数据的情况。
预处理代码2(处理并保存全部数据):
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
VG150 groundtruth 结构化导出工具 (JSON Lines 版本 v2)
- 遍历数据集中所有图片
- 将每张图片的信息保存为单行 JSON 对象,输出为 .jsonl 文件
- 优化数据结构,使其更紧凑、易于审阅
- 新增:统计并报告缺少 URL 的图像信息
"""
import argparse
import json
import sys
from typing import Any, Dict, List
from urllib.parse import urlparse
import os
# Hard dependency: exit with an install hint when h5py is missing.
try:
    import h5py
except ImportError:
    print("错误:缺少依赖 h5py。请先安装:pip install h5py")
    sys.exit(1)
# Optional dependency: without tqdm the script still works, just without a
# progress bar — the fallback wrapper returns the iterable unchanged.
try:
    from tqdm import tqdm
except ImportError:
    print("提示:缺少可选依赖 tqdm。建议安装以显示进度条:pip install tqdm")
    def tqdm(iterable, **kwargs):
        # Accepts and ignores tqdm-style kwargs (e.g. desc=...).
        return iterable
def load_json(path: str) -> Any:
    """Parse and return the JSON document stored at *path* (UTF-8)."""
    with open(path, "r", encoding="utf-8") as handle:
        document = json.load(handle)
    return document
def coerce_int_key_map(m: Dict[Any, Any]) -> Dict[int, str]:
    """Return a copy of *m* whose keys are ints and whose values are strings.

    Keys that cannot be converted to an integer are dropped silently.
    """
    converted: Dict[int, str] = {}
    for raw_key, raw_val in m.items():
        try:
            numeric_key = int(raw_key)
        except (ValueError, TypeError):
            continue
        converted[numeric_key] = str(raw_val)
    return converted
def process_image_groundtruth(
    f: Any,  # h5py.File, or any mapping exposing the same array datasets
    image_data: List[Dict[str, Any]],
    idx2label_maps: Dict[str, Dict[int, str]],
    img_index: int,
) -> Dict[str, Any]:
    """Build the groundtruth record (objects + unique triplets) of one image.

    Returns a dict with keys ``image_info``, ``objects`` and
    ``groundtruth_triplets``. Triplets are de-duplicated while preserving
    first-seen order, so repeated exports produce byte-identical files
    (a plain set() would make the ordering nondeterministic across runs).
    """
    img_info = image_data[img_index] if img_index < len(image_data) else {}

    # Box index span; VG marks images without boxes with -1/-1, so guard
    # explicitly instead of relying on negative-slice behaviour.
    fb = int(f["img_to_first_box"][img_index])
    lb = int(f["img_to_last_box"][img_index])
    if fb < 0:
        boxes, labels, attributes_rows = [], [], []
    else:
        boxes = f["boxes_512"][fb : lb + 1].tolist()
        labels = f["labels"][fb : lb + 1].flatten().tolist()
        attributes_rows = f["attributes"][fb : lb + 1].tolist()

    idx2obj = idx2label_maps.get("idx_to_label", {})
    decoded_labels = [idx2obj.get(x, f"<unk:{x}>") for x in labels]

    # Attribute id 0 is padding -> keep only positive ids.
    idx2attr = idx2label_maps.get("idx_to_attribute", {})
    decoded_attrs_rows = [
        [idx2attr.get(x, f"<unk:{x}>") for x in row if x > 0]
        for row in attributes_rows
    ]

    all_boxes = [
        {
            "label": decoded_labels[i],
            "attributes": decoded_attrs_rows[i],
            "box": boxes[i],
        }
        for i in range(len(labels))
    ]

    # Relation index span; -1/-1 means the image has no relations.
    fr = int(f["img_to_first_rel"][img_index])
    lr = int(f["img_to_last_rel"][img_index])
    if fr < 0:
        rel_pairs, rel_labels = [], []
    else:
        rel_pairs = f["relationships"][fr : lr + 1].tolist()
        rel_labels = f["predicates"][fr : lr + 1].flatten().tolist()
    idx2pred = idx2label_maps.get("idx_to_predicate", {})
    decoded_rels = [idx2pred.get(x, f"<unk:{x}>") for x in rel_labels]

    # Relationship endpoints are *global* box indices -> shift to local ones.
    triplets = []
    for i, (subj_global, obj_global) in enumerate(rel_pairs):
        subj_local = subj_global - fb
        obj_local = obj_global - fb
        subj_name = decoded_labels[subj_local] if 0 <= subj_local < len(decoded_labels) else "<out_of_bounds>"
        obj_name = decoded_labels[obj_local] if 0 <= obj_local < len(decoded_labels) else "<out_of_bounds>"
        triplets.append([subj_name, decoded_rels[i], obj_name])

    # Order-preserving de-duplication (dict.fromkeys keeps insertion order).
    triplets_unique = [list(t) for t in dict.fromkeys(tuple(t) for t in triplets)]

    return {
        "image_info": img_info,
        "objects": all_boxes,
        "groundtruth_triplets": triplets_unique,
    }
def main():
    """CLI entry point: export every image's groundtruth as JSON Lines."""
    cli = argparse.ArgumentParser(description="VG150 groundtruth 导出为 JSON Lines 格式 (带统计功能)")
    cli.add_argument("--image_data", type=str, default="image_data.json", help="图像元数据 JSON 文件路径")
    cli.add_argument("--dicts", type=str, default="VG-SGG-dicts-with-attri.json", help="字典 JSON 文件路径")
    cli.add_argument("--h5", type=str, default="VG-SGG-with-attri.h5", help="数据集 HDF5 文件路径")
    cli.add_argument("--output_file", type=str, default="vg150_groundtruth.jsonl", help="输出的 JSON Lines 文件路径")
    opts = cli.parse_args()

    print("正在加载元数据...")
    meta = load_json(opts.image_data)
    dicts = load_json(opts.dicts)
    # Keep only the name maps that actually exist in the dicts file.
    name_maps: Dict[str, Dict[int, str]] = {}
    for map_key in ("idx_to_label", "idx_to_predicate", "idx_to_attribute"):
        if map_key in dicts:
            name_maps[map_key] = coerce_int_key_map(dicts[map_key])

    # Track images whose metadata lacks a URL so we can report them at the end.
    missing_url_count = 0
    images_with_missing_urls: List[Dict[str, Any]] = []

    print(f"正在打开 HDF5 文件: {opts.h5}")
    with h5py.File(opts.h5, "r") as h5_file, open(opts.output_file, "w", encoding="utf-8") as writer:
        total = h5_file["img_to_first_box"].shape[0]
        print(f"发现 {total} 张图像,开始处理并写入到 {opts.output_file}...")
        for idx in tqdm(range(total), desc="处理图像中"):
            record = process_image_groundtruth(h5_file, meta, name_maps, img_index=idx)
            info = record["image_info"]
            url = info.get("url")
            if not url:
                missing_url_count += 1
                images_with_missing_urls.append({
                    "index": idx,
                    "image_id": info.get("image_id"),
                })
            # Derive the output filename from the URL; fall back to "<id>.jpg".
            if url:
                image_filename = os.path.basename(urlparse(url).path)
            else:
                image_filename = f"{info.get('image_id', idx)}.jpg"
            line = {
                "image_id": image_filename,
                "groundtruth_triplets": record["groundtruth_triplets"],
                "objects": record["objects"],
                "image_info": {
                    "width": info.get("width"),
                    "height": info.get("height"),
                    "url": url,
                },
            }
            writer.write(json.dumps(line, ensure_ascii=False) + "\n")

    print(f"\n处理完成!数据已成功保存到: {opts.output_file}")
    # Final data-integrity report; the footer line is only printed in the
    # missing-URL branch (matches the original's observed output).
    print("\n--- 数据完整性统计报告 ---")
    if missing_url_count == 0:
        print("✅ 所有图像都包含有效的URL,数据完整性良好!")
    else:
        print(f"⚠️ 发现 {missing_url_count} 张图像缺少URL信息。")
        print(" 对于这些图像,已使用其数字 image_id 生成了备用文件名 (例如 '123.jpg')。")
        print(f" 以下是前 {min(10, missing_url_count)} 个缺少URL的图像详情:")
        for item in images_with_missing_urls[:10]:
            print(f" - 图像索引: {item['index']}, 原始数字ID: {item['image_id']}")
        print("--------------------------")


if __name__ == "__main__":
    main()
代码2预处理了全部数据,并且把这些数据保存到了vg150_groundtruth.jsonl里。
正在加载元数据...
正在打开 HDF5 文件: VG-SGG-with-attri.h5
发现 108073 张图像,开始处理并写入到 vg150_groundtruth.jsonl...
处理图像中: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108073/108073 [02:35<00:00, 695.98it/s]
处理完成!数据已成功保存到: vg150_groundtruth.jsonl
--- 数据完整性统计报告 ---
✅ 所有图像都包含有效的URL,数据完整性良好!
生成的vg150_groundtruth.jsonl文件内容的前两条如下:
{"image_id": "1.jpg", "groundtruth_triplets": [["sign", "on", "building"], ["man", "has", "shirt"], ["man", "wearing", "glass"], ["bike", "behind", "man"], ["bike", "on", "sidewalk"], ["man", "has", "shoe"], ["man", "wears", "sneaker"], ["man", "in", "shirt"], ["sidewalk", "near", "street"], ["man", "wears", "pant"], ["tree", "near", "street"], ["tree", "near", "sidewalk"], ["man", "wearing", "pant"], ["man", "has", "glass"], ["man", "wearing", "shirt"], ["bike", "near", "tree"], ["man", "wearing", "shoe"], ["building", "with", "window"], ["shirt", "on", "man"], ["bike", "parked on", "sidewalk"], ["car", "parked on", "street"], ["man", "has", "pant"]], "objects": [{"label": "tree", "attributes": [], "box": [256, 178, 512, 357]}, {"label": "sidewalk", "attributes": ["brick"], "box": [280, 289, 463, 186]}, {"label": "building", "attributes": ["brick", "tall"], "box": [71, 172, 143, 345]}, {"label": "street", "attributes": ["clean"], "box": [395, 263, 230, 166]}, {"label": "clock", "attributes": ["green", "tall"], "box": [294, 155, 50, 233]}, {"label": "window", "attributes": [], "box": [447, 47, 127, 95]}, {"label": "man", "attributes": [], "box": [260, 248, 49, 160]}, {"label": "man", "attributes": [], "box": [170, 242, 38, 168]}, {"label": "sign", "attributes": ["black"], "box": [103, 64, 50, 115]}, {"label": "car", "attributes": ["white", "parked"], "box": [485, 270, 52, 105]}, {"label": "shirt", "attributes": ["grey"], "box": [260, 220, 53, 64]}, {"label": "car", "attributes": [], "box": [330, 233, 50, 61]}, {"label": "pant", "attributes": ["grey"], "box": [262, 276, 31, 82]}, {"label": "shirt", "attributes": ["orange", "red"], "box": [170, 216, 35, 66]}, {"label": "pant", "attributes": ["black"], "box": [170, 283, 29, 76]}, {"label": "shoe", "attributes": ["brown"], "box": [262, 318, 31, 18]}, {"label": "arm", "attributes": ["raised"], "box": [246, 194, 20, 27]}, {"label": "bike", "attributes": ["parked"], "box": [224, 215, 18, 24]}, {"label": "bike", 
"attributes": [], "box": [213, 211, 18, 27]}, {"label": "glass", "attributes": [], "box": [300, 209, 28, 15]}, {"label": "street", "attributes": ["brick"], "box": [276, 293, 457, 168]}, {"label": "sneaker", "attributes": ["grey"], "box": [171, 320, 34, 17]}, {"label": "bike", "attributes": [], "box": [217, 214, 26, 23]}], "image_info": {"width": 800, "height": 600, "url": "https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg"}}
{"image_id": "2.jpg", "groundtruth_triplets": [["building", "has", "window"], ["bike", "near", "car"]], "objects": [{"label": "sidewalk", "attributes": ["brick", "white"], "box": [357, 302, 306, 162]}, {"label": "building", "attributes": ["orange", "brown", "tall"], "box": [436, 132, 146, 265]}, {"label": "building", "attributes": ["red", "brown"], "box": [191, 102, 166, 205]}, {"label": "man", "attributes": ["white", "walking"], "box": [249, 286, 91, 158]}, {"label": "pole", "attributes": [], "box": [286, 171, 32, 342]}, {"label": "window", "attributes": ["glass"], "box": [444, 95, 61, 106]}, {"label": "car", "attributes": ["parked", "red"], "box": [193, 260, 82, 70]}, {"label": "tree", "attributes": ["green"], "box": [28, 162, 57, 141]}, {"label": "tree", "attributes": ["green"], "box": [98, 176, 68, 104]}, {"label": "tree", "attributes": ["green"], "box": [56, 164, 40, 132]}, {"label": "window", "attributes": ["glass"], "box": [442, 202, 63, 71]}, {"label": "window", "attributes": ["glass"], "box": [493, 96, 34, 104]}, {"label": "car", "attributes": ["white"], "box": [249, 235, 52, 40]}, {"label": "window", "attributes": ["glass"], "box": [495, 202, 32, 69]}, {"label": "window", "attributes": [], "box": [409, 100, 12, 109]}, {"label": "bike", "attributes": [], "box": [281, 278, 31, 41]}], "image_info": {"width": 800, "height": 600, "url": "https://cs.stanford.edu/people/rak248/VG_100K/2.jpg"}}
其中比如"image_id": "1.jpg"的这个image_id的文件名,是从这个数据中的"url": "https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg" 中得到的。
本文档涉及到的所有文件的百度网盘分享:
通过网盘分享的文件:image_data.json等4个文件
链接: https://pan.baidu.com/s/1LzAUWZDeXTi_Xmk6hqvQRg 提取码: u3g5
魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。
更多推荐



所有评论(0)