使用CNNs网络，基于caltech 101数据集实现分类

首先下载 Caltech 101 数据集并解压，路径格式如图所示。此处以 PyCharm 为基础（PyCharm 以当前文件夹的目录为根目录）。然后使用该数据集制作易于读取的 npy 文件，具体做法见下文的 data_process(img_size) 函数。

带霸气的骑士

5291人浏览 · 2021-01-12 16:55:44

带霸气的骑士 · 2021-01-12 16:55:44 发布

首先下载101数据集

caltech 101

把文件解压，路径格式如图所示。

此处是以pycharm为基础（pycharm以当前文件夹的目录为根目录）

路径描述

使用该数据集，制作npy易读文件格式

def data_process(img_size):
    """Resize the dataset images to squares and cache them as .npy files.

    For every entry of the module-level ``categories`` list, each file in
    ``path/<category>/`` is opened, resized to ``(img_size, img_size)`` and
    kept only if the resulting array has exactly three channels. The label
    of an image is the index of its category in ``categories``.

    Args:
        img_size: Edge length, in pixels, of the square output images.

    Side effects:
        Saves the images under ``root_path + '/x<img_size>'`` and the labels
        under ``root_path + '/y<img_size>'``; ``np.save`` appends ``.npy``.
    """
    imgs = []
    labels = []
    size = (img_size, img_size)

    for label, category in enumerate(tqdm(categories)):
        category_dir = path + "/" + category
        for f in os.listdir(category_dir):
            fullpath = os.path.join(category_dir, f)
            # Use a context manager so each file handle is released right
            # away instead of leaking one open file per image.
            with Image.open(fullpath) as img:
                # LANCZOS is the same filter as the legacy ANTIALIAS alias.
                pixels = np.asarray(img.resize(size, Image.LANCZOS))
            # Drop anything that is not 3-channel so that every kept array
            # has the same shape.
            if pixels.shape == (img_size, img_size, 3):
                imgs.append(pixels)
                labels.append(label)
    np.save(root_path + '/' + 'x' + str(img_size), imgs)
    np.save(root_path + '/' + 'y' + str(img_size), labels)
img_size = 200
# np.save appends ".npy" to the name it is given, so the cache file to look
# for carries that suffix.
full_path = root_path + '/' + 'x' + str(img_size) + '.npy'
# Build the cache only when it is missing; otherwise just report it.
if os.path.exists(full_path):
    print("{} file already exists.".format(full_path))
else:
    data_process(img_size)

上面代码用到的模块（os、numpy、PIL、tqdm）需要先 import，后续会展示完整的代码。
运行后会在 dataset 目录下生成 x200.npy 和 y200.npy 两个文件。后续处理时直接读取这两个 npy 文件作为输入数据集。

下面开始说明网络训练过程和导入。

cal_101_googlenet.py

from keras import backend as K
from keras.utils import np_utils
import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm
from modles.googlenet import GoogLeNetBN

# set GPU usage
import os
# Make only GPU 0 visible to this process; set before TensorFlow is imported below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
# Let TensorFlow grow its GPU memory allocation on demand.
config.gpu_options.allow_growth = True
# config.gpu_options.per_process_gpu_memory_fraction = 0.4
# Register the configured session as the one Keras uses.
set_session(tf.Session(config=config))
# Import the required modules and configure the GPU

# Hyperparameters: input edge length in pixels and number of output classes.
image_size, classes = 200, 101

# Dataset locations: the cache directory and the unpacked image tree below it.
root_path = 'dataset'
path = root_path + '/Caltech 101/101_ObjectCategories'

# One sub-directory per category; the sorted order defines the label indices.
categories = os.listdir(path)
categories.sort()
ncategories = len(categories)
print(ncategories)
# NOTE(review): `classes` is fixed at 101 while `ncategories` is read from
# disk -- confirm the two agree (e.g. no extra folder under `path`).

def data_process(img_size):
    """Resize the dataset images to squares and cache them as .npy files.

    For every entry of the module-level ``categories`` list, each file in
    ``path/<category>/`` is opened, resized to ``(img_size, img_size)`` and
    kept only if the resulting array has exactly three channels. The label
    of an image is the index of its category in ``categories``.

    Args:
        img_size: Edge length, in pixels, of the square output images.

    Side effects:
        Saves the images under ``root_path + '/x<img_size>'`` and the labels
        under ``root_path + '/y<img_size>'``; ``np.save`` appends ``.npy``.
    """
    imgs = []
    labels = []
    size = (img_size, img_size)

    for label, category in enumerate(tqdm(categories)):
        category_dir = path + "/" + category
        for f in os.listdir(category_dir):
            fullpath = os.path.join(category_dir, f)
            # Use a context manager so each file handle is released right
            # away instead of leaking one open file per image.
            with Image.open(fullpath) as img:
                # LANCZOS is the same filter as the legacy ANTIALIAS alias.
                pixels = np.asarray(img.resize(size, Image.LANCZOS))
            # Drop anything that is not 3-channel so that every kept array
            # has the same shape.
            if pixels.shape == (img_size, img_size, 3):
                imgs.append(pixels)
                labels.append(label)
    np.save(root_path + '/' + 'x' + str(img_size), imgs)
    np.save(root_path + '/' + 'y' + str(img_size), labels)
img_size = image_size  # edge length every image is resized to
# np.save appends ".npy" to the name it is given, so the cache file to look
# for carries that suffix.
full_path = root_path + '/' + 'x' + str(img_size) + '.npy'
# Build the cache only when it is missing; otherwise just report it.
if os.path.exists(full_path):
    print("{} file already exists.".format(full_path))
else:
    data_process(img_size)
## Dataset preprocessing

# Options shared by both np.load calls below.
_npy_load_options = dict(
    mmap_mode=None,
    allow_pickle=True,
    fix_imports=True,
    encoding='ASCII',
)
x = np.load('dataset/x%s.npy' % img_size, **_npy_load_options)
y = np.load('dataset/y%s.npy' % img_size, **_npy_load_options)
print("successfully load x%s.npy" % img_size)
## Load the cached data
plt.imshow(x[96])
plt.show()
## Show one sample to check that the data loaded correctly


# Seed NumPy's global RNG before the split below.
seed = 7
np.random.seed(seed)
# import pandas as pd
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)
X_train = np.stack(X_train, axis=0)
y_train = np.stack(y_train, axis=0)
X_test = np.stack(X_test, axis=0)
y_test = np.stack(y_test, axis=0)
print("Num train_imgs: %d" % (len(X_train)))
print("Num test_imgs: %d" % (len(X_test)))
# One-hot encode the labels. The width is pinned to `classes` so that both
# splits match the model's output layer; without num_classes the width is
# inferred separately from the largest label present in each split.
y_train = np_utils.to_categorical(y_train, num_classes=classes)
y_test = np_utils.to_categorical(y_test, num_classes=classes)
## Split the data into training and test sets, 0.8:0.2

X_train = X_train.reshape((int(len(X_train)), img_size, img_size, 3))
X_test = X_test.reshape((int(len(X_test)), img_size, img_size, 3))

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
## Reshape the data to (N, img_size, img_size, 3)


import numpy as np
from keras.callbacks import ReduceLROnPlateau, CSVLogger, EarlyStopping

# Callbacks: learning-rate reduction, early stopping on validation accuracy,
# and a per-epoch CSV log.
lr_reducer = ReduceLROnPlateau(
    factor=np.sqrt(0.1),
    cooldown=0,
    patience=5,
    min_lr=0.5e-6,
)
early_stopper = EarlyStopping(
    monitor='val_acc',
    min_delta=0.001,
    patience=10,
    mode='max',
)
csv_logger = CSVLogger('googlenet_caltech101')

# Build the network for square RGB inputs and compile it.
model = GoogLeNetBN(
    input_shape=(img_size, img_size, 3),
    classes=classes,
)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
## Build the model

# Train the model
Y_train = y_train
Y_test = y_test
data_augmentation = False  # whether to use real-time data augmentation
from keras.preprocessing.image import ImageDataGenerator
if not data_augmentation:
    print('Not using data augmentation.')
    # `epochs` is the Keras 2 name of the legacy `nb_epoch` argument; it is
    # the same keyword the fit_generator call below already uses.
    model.fit(X_train, Y_train,
              batch_size=32,
              epochs=400,
              validation_data=(X_test, Y_test),
              shuffle=True,
              verbose=2,
              callbacks=[lr_reducer, early_stopper, csv_logger])
else:
    print('Using real-time data augmentation.')
    # This will do preprocessing and realtime data augmentation:
    datagen = ImageDataGenerator(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
        width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
        height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
        horizontal_flip=True,  # randomly flip images
        vertical_flip=False)  # randomly flip images

    # Compute quantities required for featurewise normalization
    # (std, mean, and principal components if ZCA whitening is applied).
    datagen.fit(X_train)

    # Fit the model on the batches generated by datagen.flow().
    # `max_queue_size` is the Keras 2 name of the legacy `max_q_size`.
    model.fit_generator(datagen.flow(X_train, Y_train, batch_size=32),
                        steps_per_epoch=X_train.shape[0] // 32,
                        validation_data=(X_test, Y_test),
                        epochs=400, verbose=2, max_queue_size=257,
                        callbacks=[lr_reducer, early_stopper, csv_logger])

googlenet.py



from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from keras import backend
from keras import layers
from keras import models
from keras import utils as keras_utils
from modles.pym_attention import se_block,cbam_block,pyramid_attention_block_1

def conv2d_bn(x, filters, kernel_size=(3, 3), padding='same', strides=(1, 1),
              name=None):
    """Apply a bias-free Conv2D, then BatchNormalization, then ReLU.

    # Arguments
        x: input tensor.
        filters: number of convolution filters.
        kernel_size: convolution kernel size.
        padding: padding mode passed to `Conv2D`.
        strides: convolution strides.
        name: optional base name. The convolution and batch-norm layers
            are named `name + '_conv'` and `name + '_bn'`, and the
            activation layer is named `name`. With `None`, no explicit
            layer names are passed.

    # Returns
        Output tensor after the ReLU activation.
    """
    conv_name = None if name is None else name + '_conv'
    bn_name = None if name is None else name + '_bn'
    # The axis handed to batch norm depends on the backend's data format.
    bn_axis = 1 if backend.image_data_format() == 'channels_first' else 3

    convolved = layers.Conv2D(filters, kernel_size,
                              strides=strides,
                              padding=padding,
                              use_bias=False,
                              name=conv_name)(x)
    normalized = layers.BatchNormalization(axis=bn_axis,
                                           name=bn_name)(convolved)
    return layers.Activation('relu', name=name)(normalized)

def inception(x, filters):
    """Build one inception module with four parallel branches.

    # Arguments
        x: input tensor.
        filters: a 4-item sequence
            `(n1x1, (n3x3_reduce, n3x3), (n5x5_reduce, n5x5), npool)`
            giving the filter counts of the 1x1 branch, the 3x3 branch
            (1x1 reduction, then 3x3), the 5x5 branch (1x1 reduction,
            then 5x5) and the 1x1 convolution after the pooling branch.

    # Returns
        Output tensor: the four branch outputs concatenated along the
        axis selected by the backend's image data format.

    # Raises
        ValueError: if `filters` does not have 4 items, or if its second
            or third item does not have 2 items.
    """
    if len(filters) != 4:
        raise ValueError('filters should have 4 components')
    if not (len(filters[1]) == 2 and len(filters[2]) == 2):
        raise ValueError('incorrect spec of filters')

    # Branch 1: plain 1x1 convolution.
    tower_1x1 = conv2d_bn(x, filters[0], (1, 1))

    # Branch 2: 1x1 reduction followed by a 3x3 convolution.
    tower_3x3 = conv2d_bn(
        conv2d_bn(x, filters[1][0], (1, 1)), filters[1][1], (3, 3))

    # Branch 3: 1x1 reduction followed by a 5x5 convolution.
    tower_5x5 = conv2d_bn(
        conv2d_bn(x, filters[2][0], (1, 1)), filters[2][1], (5, 5))

    # Branch 4: 3x3 average pooling (stride 1) followed by a 1x1 convolution.
    pooled = layers.AveragePooling2D(
        pool_size=(3, 3), strides=(1, 1), padding='same')(x)
    tower_pool = conv2d_bn(pooled, filters[3], (1, 1))

    concat_axis = 1 if backend.image_data_format() == 'channels_first' else 3
    return layers.concatenate(
        [tower_1x1, tower_3x3, tower_5x5, tower_pool], axis=concat_axis)


def GoogLeNetBN(include_top=True,
                weights=None,
                input_tensor=None,
                input_shape=None,
                pooling='max',
                classes=1000,
                **kwargs):
    """Instantiates the GoogLeNetBN architecture.

    The network is a stem of `conv2d_bn` and max-pooling layers followed
    by nine `inception` modules (3a-3b, 4a-4e, 5a-5b) and, optionally,
    a classification block.

    # Arguments
        include_top: whether to append the classification block
            (global pooling, dropout 0.4, softmax dense layer).
            When False, the model output is the 4D tensor produced by
            the last inception module.
        weights: must be None.
        input_tensor: Keras tensor (i.e. output of `layers.Input()`)
            to use as image input for the model.
        input_shape: input tensor shape, which is used to create an
            input tensor if `input_tensor` is not specified.
        pooling: global pooling mode used inside the classification
            block; only read when `include_top` is True.
            - `avg` means that global average pooling is applied.
            - `max` means that global max pooling is applied.
            - any other value raises `ValueError`.
        classes: number of units of the final softmax layer; only
            used if `include_top` is True.
        **kwargs: accepted but not read by this function.

    # Returns
        A Keras model instance.

    # Raises
        ValueError: if `weights` is not None, if neither `input_tensor`
            nor `input_shape` is given, or if `include_top` is True and
            `pooling` is neither `avg` nor `max`.
    """
    # Attention hook is switched off: with `att_block` left as None, the
    # four `if att_block is not None` branches below are skipped.
    att_block = None
    if weights is not None:
        raise ValueError('weights is not currently supported')
    if input_tensor is None:
        if input_shape is None:
            raise ValueError('neither input_tensor nor input_shape is given')
        img_input = layers.Input(shape=input_shape)
    else:
        # Wrap a non-Keras tensor in an Input layer; use a Keras tensor as is.
        if not backend.is_keras_tensor(input_tensor):
            img_input = layers.Input(tensor=input_tensor, shape=input_shape)
        else:
            img_input = input_tensor

    # Stem. The spatial sizes quoted in the stage comments below correspond
    # to a 224x224 input (three stride-2 steps before stage 3 give 28x28);
    # the channel counts are the sums of each module's four branch widths.
    x = conv2d_bn(img_input, 64, (7, 7), strides=(2, 2))
    x = layers.MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding='same')(x)
    if att_block is not None:
        x = att_block(x, name='a')
    x = conv2d_bn(x,  64, (1, 1))
    x = conv2d_bn(x, 192, (3, 3))
    x = layers.MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding='same')(x)
    if att_block is not None:
        x = att_block(x, name='b')
    x = inception(x, ( 64,  (96, 128), (16,  32),  32))  # 3a: 28x28x256
    x = inception(x, (128, (128, 192), (32,  96),  64))  # 3b: 28x28x480
    x = layers.MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding='same')(x)
    if att_block is not None:
        x = att_block(x, name='c')
    x = inception(x, (192,  (96, 208), (16,  48),  64))  # 4a: 14x14x512
    x = inception(x, (160, (112, 224), (24,  64),  64))  # 4b: 14x14x512
    x = inception(x, (128, (128, 256), (24,  64),  64))  # 4c: 14x14x512
    x = inception(x, (112, (144, 288), (32,  64),  64))  # 4d: 14x14x528
    x = inception(x, (256, (160, 320), (32, 128), 128))  # 4e: 14x14x832
    x = layers.MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding='same')(x)
    if att_block is not None:
        x = att_block(x, name='d')
    x = inception(x, (256, (160, 320), (32, 128), 128))  # 5a: 7x7x832
    x = inception(x, (384, (192, 384), (48, 128), 128))  # 5b: 7x7x1024

    if include_top:
        # Classification block
        if pooling == 'avg':
            x = layers.GlobalAveragePooling2D(name='global_pool')(x)
        elif pooling == 'max':
            x = layers.GlobalMaxPooling2D(name='global_pool')(x)
        else:
            raise ValueError('bad spec of global pooling')
        x = layers.Dropout(0.4)(x)
        x = layers.Dense(classes, activation='softmax', name='predictions')(x)

    # Ensure that the model takes into account
    # any potential predecessors of `input_tensor`.
    # NOTE(review): confirm that `keras.utils` exposes `get_source_inputs`
    # in the Keras version this file is run with.
    if input_tensor is not None:
        inputs = keras_utils.get_source_inputs(input_tensor)
    else:
        inputs = img_input

    # Create model.
    model = models.Model(inputs, x, name='googlenet_bn')

    return model

运行环境

python=3.6
keras=2.1.2
pandas=1.1.5
pillow=8.0.1
scikit-learn=0.24.0
scipy=1.5.4
tensorflow-gpu=1.3.0
tqdm=4.54.1

开始训练

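把cal_101_googlenet.py和dataset放在同一路径下，并把googlenet.py放在该路径下的modles文件夹中（代码里的导入语句是 from modles.googlenet import GoogLeNetBN，googlenet.py 本身还会导入 modles.pym_attention）。用pycharm打开这个路径，然后直接run cal_101_googlenet.py即可。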

结果说明

训练过程中设置了EarlyStopping：如果val_acc连续10个epoch都没有提高至少0.001，就会停止迭代。所以从最后一个epoch往前数第10个epoch，一般就是准确率最高的一次。
我跑出来的结果是【0.9931, 0.7066】。

魔乐社区

魔乐社区（Modelers.cn) 是一个中立、公益的人工智能社区，提供人工智能工具、模型、数据的托管、展示与应用协同服务，为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作，由全产业链共同建设、共同运营、共同享有，推动国产AI生态繁荣发展。

更多推荐

替你试过了，消费级显卡可以跑的开源文生图SOTA模型，顶级渲染、高密度文本绘图

魔乐社区

量化挑战赛冠军专访：4小时啃下W4A8量化，我靠的是这些经验

魔乐社区

小参数・大码力・易部署 | Qwen3.6-27B上线魔乐社区，基于昇腾的部署教程来了

继一周前模型开源发布后，千问再度开源Qwen3.6-27B —— 一个拥有270亿参数的稠密多模态模型，也是社区呼声最高的模型规格。Qwen3.6-27B 依然支持多模态思考与非思考模式，在智能体编程方面达到了旗舰级表现，全面超越前代开源旗舰 Qwen3.5-397B-A17B（总参数397B / 激活参数17B的MoE模型）。作为稠密架构，它无需MoE路由即可部署，是开发者在实用、可广泛部署规模