删除voc数据集中指定的标签信息,并拷贝到新的输出文件夹中
本博客记录一个比较常用的功能:清洗标注数据。标注数据集时可能会出现错误,例如标注者输入标签时多输入了一个空格或小数点,输入标签 1 时不小心多按了一个 1,或者标注者理解有误,导致整个数据集中某个标签全部标错。针对以上情况,可以用以下两种方式清洗数据。
·
1、删除voc数据集中指定的标签信息,并拷贝到新的输出文件夹中
本博客记录一个比较常用的功能:清洗标注数据。标注数据集时可能会出现错误,例如标注者输入标签时多输入了一个空格或小数点,输入标签 1 时不小心多按了一个 1,或者标注者理解有误,导致整个数据集中某个标签全部标错。针对以上情况,可以用以下两种方式清洗数据。
2、筛查标注文件中所有的类别是否和场景所需要的标签一致
假设场景允许的标签为 allowed_labels = ['1', '2', '3', '0'],以下代码会找出不在该列表中的标签值,打印包含错误标签的 xml 文件路径及对应的错误标签,并生成包含数据集中所有标签的 labels.txt 文件。如果标签有错误,训练时可能会出现 RuntimeError: CUDA error 报错,点我进入解决问题。
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm
def getClsTxt(xmlDir, cls_txt, allowed_labels):
    """
    Collect every object label used in a directory of VOC XML annotations.

    Each ``<object>/<name>`` value is recorded. Labels that are not in
    ``allowed_labels`` are reported on stdout together with the path of the
    file that contains them, and all distinct labels found by this call are
    written to ``cls_txt``, one per line in sorted order.

    xmlDir : XML directory path
    cls_txt : Output cls file path
    allowed_labels: List of allowed labels
    """
    found_labels = set()
    invalid_label_paths = []  # (xml path, offending label) pairs

    # Only annotation files are parsed, so a stray image or text file in the
    # directory does not abort the scan with a ParseError.
    xml_names = sorted(n for n in os.listdir(xmlDir) if n.lower().endswith('.xml'))
    for name in tqdm(xml_names):
        xmlFile = os.path.join(xmlDir, name)
        # Let ElementTree open the file itself: read-only access is enough,
        # and the parser honours the encoding declared in the XML.
        root = ET.parse(xmlFile).getroot()
        for obj in root.iter('object'):
            cls_element = obj.find('name')
            if cls_element is None:
                continue
            # An empty <name/> has text None; treat it as the empty label so
            # it is reported instead of breaking the final write.
            cls = cls_element.text if cls_element.text is not None else ""
            found_labels.add(cls)
            if cls not in allowed_labels:
                invalid_label_paths.append((xmlFile, cls))

    # The function used to require a module-level ``set_cls`` and add the
    # labels to it. Keep feeding that set when the caller defined one, but
    # no longer fail with NameError when it is absent.
    legacy_set = globals().get('set_cls')
    if isinstance(legacy_set, set):
        legacy_set.update(found_labels)

    if invalid_label_paths:
        print("Invalid labels found in the following XML files:")
        for path, invalid_label in invalid_label_paths:
            print(f"{path}, Error category is: {invalid_label}")
    else:
        print("No invalid labels found.")

    # Explicit UTF-8 so non-ASCII labels are written the same on every OS.
    with open(cls_txt, "w", encoding="utf-8") as ft:
        ft.writelines(label + "\n" for label in sorted(found_labels))
if __name__ == '__main__':
    # getClsTxt adds every label it encounters to the module-level set
    # `set_cls`, so create it before the call.
    set_cls = set()
    # Annotation folder to scan and the file that receives the label list.
    xmlDir = "output/VOC-1205/Annotations"
    cls_txt = "output/VOC-1205/labels.txt"
    # Labels that are valid for this scenario; anything else is reported.
    allowed_labels = ['1', '2', '3', '0']
    getClsTxt(xmlDir, cls_txt, allowed_labels)
3、删除完全标错的某一个标签,不改变正确的标签
假设标签 1 和 3 是错误的(labels_to_remove = ['1', '3']),即场景中本不应该出现这两个标签,需要将其删除,代码如下:
import os
import shutil
import xml.etree.ElementTree as ET
from tqdm import tqdm
def process_xml(xml_path, output_folder, labels_to_remove):
    """Write a copy of one VOC annotation file without the unwanted labels.

    The file at ``xml_path`` is parsed, every ``<object>`` directly under the
    root whose ``<name>`` text is in ``labels_to_remove`` is removed, and the
    result is written to ``<output_folder>/Annotations/<same file name>``.
    The source file itself is not modified.

    xml_path : path of the annotation file to read
    output_folder : dataset root that receives the ``Annotations`` folder
    labels_to_remove : collection of label strings whose objects are dropped
    """
    tree = ET.parse(xml_path)
    root = tree.getroot()

    remove_objects = []
    for obj in root.findall('object'):
        name_element = obj.find('name')
        # An <object> without <name> cannot match any label; keep it rather
        # than failing with AttributeError on ``None.text``.
        if name_element is not None and name_element.text in labels_to_remove:
            remove_objects.append(obj)
    for obj in remove_objects:
        root.remove(obj)

    # Create the target folder here so the function also works when it is
    # called on its own, not only from process_dataset.
    annotations_folder = os.path.join(output_folder, 'Annotations')
    os.makedirs(annotations_folder, exist_ok=True)
    new_xml_path = os.path.join(annotations_folder, os.path.basename(xml_path))
    # The default encoding is us-ascii, which turns non-ASCII text (for
    # example Chinese file names) into character references; write UTF-8.
    tree.write(new_xml_path, encoding='utf-8')
def process_dataset(input_folder, output_folder, labels_to_remove, image_exts=('.jpg',)):
    """Copy a VOC dataset while dropping the objects with unwanted labels.

    For every ``.xml`` file in ``<input_folder>/Annotations`` that has a
    matching image in ``<input_folder>/JPEGImages``, ``process_xml`` writes
    the filtered annotation to ``<output_folder>/Annotations`` and the image
    is copied to ``<output_folder>/JPEGImages``. Annotations without a
    matching image are skipped and listed on stdout at the end.

    input_folder : root of the source dataset
    output_folder : root of the dataset to create
    labels_to_remove : collection of label strings whose objects are dropped
    image_exts : image extensions tried in order for each annotation; the
        default only looks for ``.jpg``
    """
    if isinstance(image_exts, str):
        image_exts = (image_exts,)

    ann_input_folder = os.path.join(input_folder, 'Annotations')
    img_input_folder = os.path.join(input_folder, 'JPEGImages')
    img_output_folder = os.path.join(output_folder, 'JPEGImages')
    ann_output_folder = os.path.join(output_folder, 'Annotations')
    os.makedirs(img_output_folder, exist_ok=True)
    os.makedirs(ann_output_folder, exist_ok=True)

    xml_files = sorted(f for f in os.listdir(ann_input_folder) if f.endswith('.xml'))
    skipped = []
    for xml_file in tqdm(xml_files, desc='Processing XML files'):
        stem = os.path.splitext(xml_file)[0]
        candidates = (os.path.join(img_input_folder, stem + ext) for ext in image_exts)
        img_file = next((path for path in candidates if os.path.isfile(path)), None)
        if img_file is None:
            # Remember the file so the omission is visible to the user.
            skipped.append(xml_file)
            continue
        process_xml(os.path.join(ann_input_folder, xml_file), output_folder, labels_to_remove)
        shutil.copy(img_file, img_output_folder)

    if skipped:
        print(f"Skipped {len(skipped)} annotation(s) without a matching image:")
        for xml_file in skipped:
            print(f"  {xml_file}")
if __name__ == "__main__":
    # Source dataset, destination for the cleaned copy, and the labels whose
    # objects are removed from every annotation.
    input_folder = './output/VOC-1205'
    output_folder = './output/VOC-1206'
    labels_to_remove = ['1', '3']
    process_dataset(input_folder, output_folder, labels_to_remove)
4、删除.xml文件中没有一条bbox信息的文件以及图片
经过上述第 3 节删除部分标签后,某些 xml 文件中可能已经没有任何 bbox 框,这样的图片和 .xml 文件可以一并剔除。下面的代码只把仍然包含 bbox 的图片和 .xml 文件拷贝到新的输出文件夹中,代码如下:
import os
import shutil
import xml.etree.ElementTree as ET
def _pick_source_dir(input_folder, subfolder, output_subfolder):
    """Return ``input_folder/subfolder`` if it is a usable source, else ``input_folder``."""
    candidate = os.path.join(input_folder, subfolder)
    # A folder that is really the output location is not a source, e.g. when
    # input_folder and output_folder are the same directory.
    is_output = os.path.abspath(candidate) == os.path.abspath(output_subfolder)
    if os.path.isdir(candidate) and not is_output:
        return candidate
    return input_folder


def copy_non_empty_bndbox_files(input_folder, output_folder):
    """Copy only the annotation/image pairs that still contain a bounding box.

    An annotation is kept when at least one ``<bndbox>`` element in it has
    child elements. Kept ``.xml`` files go to ``<output_folder>/Annotations``
    and their ``.jpg`` images to ``<output_folder>/JPEGImages``.

    ``input_folder`` may be a VOC root with ``Annotations`` and
    ``JPEGImages`` subfolders, or a flat folder holding the ``.xml`` and
    ``.jpg`` files side by side. Annotations whose image is missing are
    skipped with a message instead of aborting the run.

    input_folder : folder to read from
    output_folder : root of the dataset to create
    """
    output_images_folder = os.path.join(output_folder, 'JPEGImages')
    output_annotations_folder = os.path.join(output_folder, 'Annotations')

    # Resolve the sources before creating the output folders.
    annotations_folder = _pick_source_dir(input_folder, 'Annotations', output_annotations_folder)
    images_folder = _pick_source_dir(input_folder, 'JPEGImages', output_images_folder)

    os.makedirs(output_images_folder, exist_ok=True)
    os.makedirs(output_annotations_folder, exist_ok=True)

    for filename in sorted(os.listdir(annotations_folder)):
        if not filename.endswith('.xml'):
            continue
        xml_path = os.path.join(annotations_folder, filename)
        image_filename = os.path.splitext(filename)[0] + '.jpg'
        image_path = os.path.join(images_folder, image_filename)

        root = ET.parse(xml_path).getroot()
        # Test for children with len(); truth-testing an Element is
        # deprecated in ElementTree.
        if not any(len(bndbox) > 0 for bndbox in root.findall('.//bndbox')):
            continue
        if not os.path.isfile(image_path):
            print(f"Skipping {filename}: image not found at {image_path}")
            continue

        shutil.copy(image_path, os.path.join(output_images_folder, image_filename))
        shutil.copy(xml_path, os.path.join(output_annotations_folder, filename))
if __name__ == "__main__":
    # Dataset to filter and the destination for the image/annotation pairs
    # that are kept.
    input_folder = './origin_dataset/VOC'
    output_folder = './output/VOC-1206'
    copy_non_empty_bndbox_files(input_folder, output_folder)
5、划分voc数据集以及转换为coco数据集格式

魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。
更多推荐
所有评论(0)