抽取COCO数据集中部分类别并重新序列化类别标签
抽取COCO数据集中指定类别并重新序列化类别标签
·
首先准备好:COCO数据集
点击下载 COCO train2017 数据集
点击下载 COCO val2017 数据集
(1)images->train2017,原始coco数据集图片文件
(2)images->train,用于存放包含指定目标类别的图片
(3)labels->train2017,原始coco数据集标签文件
(4)labels->train,用于存放包含指定目标类别的图片对应的标签文件
使用前需要修改原始标签路径、目标标签路径、图片路径以及想抽取的目标类别序号。请注意:代码会把包含目标类别的标签文件和对应图片从原目录移动到目标目录,并直接删除标签文件中不属于目标类别的标注行,这些操作不可恢复,请提前备份数据。完整代码如下:
# coding=utf-8
import os
import re
import shutil
def get_class_exist_path(l_path, classes):
    """Find label files that contain at least one annotation of a wanted class.

    Every file under ``l_path`` (walked recursively) is read line by line.
    The first space-separated token of each line is taken as the class id;
    a file matches as soon as one of its lines has an id in ``classes``.

    Args:
        l_path: Directory holding the label text files.
        classes: Collection of class ids, as strings, to look for.

    Returns:
        A list of matching file paths relative to ``l_path``. For files
        directly inside ``l_path`` this is just the file name; for files in
        sub-directories it includes the sub-directory, so that
        ``os.path.join(l_path, entry)`` always points at the real file.
    """
    result = []
    for root, _dirs, files in os.walk(l_path):
        total = len(files)
        for i, file in enumerate(files):
            f_path = os.path.join(root, file)
            with open(f_path) as f:
                # Stream the file; one matching line is enough to keep it.
                for line in f:
                    items = re.split(r"[ ]+", line.strip())
                    if items[0] in classes:
                        # Keep the sub-directory part: os.walk recurses, and a
                        # bare basename could not be joined back onto l_path.
                        result.append(os.path.relpath(f_path, l_path))
                        break
            print("检查类别进度: %d/%d" % (i + 1, total))
    return result
def move_txt_files(l_path, e_fs, dst_d_path):
    """Move the listed label files from ``l_path`` into ``dst_d_path``.

    Args:
        l_path: Directory the files currently live in.
        e_fs: File names (or paths relative to ``l_path``) to move.
        dst_d_path: Destination directory. It, and any sub-directory needed
            for a relative entry in ``e_fs``, is created when missing.

    Raises:
        OSError: If a source file does not exist or cannot be moved
            (propagated from ``shutil.move``).
    """
    total = len(e_fs)
    for i, e_f in enumerate(e_fs):
        src = os.path.join(l_path, e_f)
        dst = os.path.join(dst_d_path, e_f)
        # shutil.move fails when the target directory is absent, so make
        # sure it exists before moving.
        parent = os.path.dirname(dst)
        if parent:
            os.makedirs(parent, exist_ok=True)
        shutil.move(src, dst)
        print("移动文本文件: %d/%d" % (i+1, total))
def delete_other_class(dist_l_path, classes):
    """Rewrite every label file in place, keeping only lines of wanted classes.

    Each file under ``dist_l_path`` (walked recursively) is read fully, then
    overwritten with just the lines whose first space-separated token is in
    ``classes``. Kept lines are written back unchanged; blank lines are
    dropped because their first token is the empty string.

    Args:
        dist_l_path: Directory holding the label files to filter.
        classes: Collection of class ids, as strings, to keep.
    """
    for root, _dirs, files in os.walk(dist_l_path):
        total = len(files)
        for i, file in enumerate(files):
            # Join with the directory currently being walked, not the top
            # directory, so files inside sub-directories resolve correctly.
            f_path = os.path.join(root, file)
            with open(f_path, 'r') as f:
                lines = f.readlines()
            kept = [
                line for line in lines
                if re.split(r"[ ]+", line.strip())[0] in classes
            ]
            with open(f_path, 'w') as f_w:
                f_w.writelines(kept)
            print("删除其他类别进度: %d/%d" % (i + 1, total))
def move_images(img_dir, dist_img_dir, dist_l_path):
    """Move the image belonging to each label file into ``dist_img_dir``.

    For every file found under ``dist_l_path`` (walked recursively) the
    extension is replaced by ``.jpg`` and the image of that name is moved
    from ``img_dir`` to ``dist_img_dir``.

    Args:
        img_dir: Directory holding the source images.
        dist_img_dir: Destination directory; created when missing.
        dist_l_path: Directory of label files that decides which images move.

    Raises:
        OSError: If an expected image is missing from ``img_dir`` or cannot
            be moved (propagated from ``shutil.move``).
    """
    # shutil.move fails when the target directory is absent.
    if dist_img_dir:
        os.makedirs(dist_img_dir, exist_ok=True)
    for _root, _dirs, files in os.walk(dist_l_path):
        total = len(files)
        for i, file in enumerate(files):
            image_file = os.path.splitext(file)[0] + ".jpg"
            src = os.path.join(img_dir, image_file)
            dst = os.path.join(dist_img_dir, image_file)
            shutil.move(src, dst)
            print("移动图片文件: %d/%d" % (i+1, total))
def remap_categories(dist_l_path, old_classes, new_classes):
    """Rewrite the class id of every annotation line using old -> new ids.

    ``old_classes`` and ``new_classes`` are paired position by position.
    Each file under ``dist_l_path`` (walked recursively) is read, the first
    space-separated token of every line is replaced through that mapping,
    and the file is written back with tokens joined by single spaces.
    Blank lines are dropped.

    Args:
        dist_l_path: Directory holding the label files to rewrite.
        old_classes: Original class ids, as strings.
        new_classes: Replacement ids, in the same order as ``old_classes``.

    Raises:
        KeyError: If a line's class id is not in ``old_classes``. The
            offending file is left untouched, because all lines are
            remapped before the file is reopened for writing.
    """
    category_map = dict(zip(old_classes, new_classes))
    for root, _dirs, files in os.walk(dist_l_path):
        total = len(files)
        for i, file in enumerate(files):
            # Join with the directory currently being walked so files in
            # sub-directories resolve correctly.
            f_path = os.path.join(root, file)
            with open(f_path, 'r') as f:
                lines = f.readlines()
            # Build the full output first: opening with 'w' truncates the
            # file, so a KeyError raised mid-write would destroy its labels.
            remapped = []
            for line in lines:
                items = re.split(r"[ ]+", line.strip())
                if not items[0]:
                    continue  # blank line: nothing to remap
                items[0] = str(category_map[items[0]])
                remapped.append(' '.join(items) + '\n')
            with open(f_path, 'w') as f_w:
                f_w.writelines(remapped)
            print("类别重映射进度: %d/%d" % (i + 1, total))
# Driver script. NOTE: these statements sit at module level with no
# `if __name__ == "__main__"` guard, so they run as soon as the file is
# executed or imported, and they move/rewrite files on disk.
# Directory of the original label files.
labels_path = r"D:\yqh\jupyter_notebook\yolov7-main\datasets\coco2017\coco2017\labels\train2017"
# Destination directory for the label files that are kept.
dist_labels_path = r"D:\yqh\jupyter_notebook\yolov7-main\datasets\coco2017\coco2017\labels\train"
# Target class ids: 0: person 56: chair 57: couch 60: dining table 62: tv 63: laptop
# ['person', 'chair', 'couch', 'dining table', 'tv', 'laptop']
old_classes = ['0', '56', '57', '60', '62', '63']
new_classes = list(map(str, range(len(old_classes)))) # new consecutive class ids '0'..'5', paired by position with old_classes
# Step 1: move every txt file containing any of the target classes to the destination directory.
e_files = get_class_exist_path(labels_path, old_classes)
move_txt_files(labels_path, e_files, dist_labels_path)
# Step 2: in the moved txt files, delete the lines whose class is not in the target list.
delete_other_class(dist_labels_path, old_classes)
# Step 3: move the images matching the moved label files to the destination image directory.
images_dir = r"D:\yqh\jupyter_notebook\yolov7-main\datasets\coco2017\coco2017\images\train2017"
dist_images_dir = r"D:\yqh\jupyter_notebook\yolov7-main\datasets\coco2017\coco2017\images\train"
move_images(images_dir, dist_images_dir, dist_labels_path)
# Step 4: remap the kept class ids to the new consecutive ids.
# Must run after step 2: remap_categories looks up every line's class id in the old -> new mapping.
remap_categories(dist_labels_path, old_classes, new_classes)
最后,感慨一下 AI 辅助编程太省事了,部分代码是 AI 写的。

魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。
更多推荐
所有评论(0)