Table of Contents

使用之前的去噪图像来抑制文本

1.经验

本菜鸡本科毕设在FPGA上搞过图像滤波等算法，研究生期间虽然搞的是基于深度学习的图形学，但是主干网络用的还是卷积… 感觉自己代码能力还可以，基础还行，参赛之前还是比较自信的：
觉着看几篇顶会去噪的文章，复现借鉴一下应该能取得一个不错的结果，但是——大概1000+人参赛，一多半没有提交或者只提交了个baseline，本菜最终排名100+（额，比赛还没结束，明天才结束，估计到时排名快接近200了），实在卷不动了。
主要存在三个问题：

和男生的能力差距还是不小的，可能也和研究方向有关。毕竟不是专业的。
Money is all my need?? 经历了学校服务器排队人数爆满、维修，小组里的机器也排不上队(我一个参加比赛的也不好意思和别人抢- -)，就想调个参，很难，即便排上队 – 跑的时候batch_size都调的很小才能跑
我——工具人……论文直接被拒了审稿人到底提了什么——

虽然我知道自己菜，但我还是想尝试一下。吐槽结束，进入正题：

2. 收获

谈收获，虽难，但收获也不少

看了几篇cv顶会的去噪文章，了解并尝试了cv算法中low-level的方向
尝试复现了两篇顶会，效果并没有baseline好，差不太多 – – (可能复现的不太对，毕竟只是借用思想不是完全拷贝) 最终魔改了一篇别的论文.
从dataloader、网络框架、网络初始化、训练策略到最后的损失函数等等，第一次完整地写了一个深度学习的项目(以前都是拿别人代码框架改改)，遇到很多坑，也学到了许多新的知识点

3.经验分享（部分源码展示和注释）

3.1 输入

图片是要切片的，一整张图太大了，网络稍大点，32G的显卡也会爆显存
将一个图划分为多个图，伪代码如下：

# Pseudo-code: this runs inside an outer loop that tiles the image according to its size.
tmp['imgs'] = data['imgs'][:, :, a:b, c:d]     # axes: batch, channels, image height and width
tmp['gts'] = data['gts'][:, :, a:b, c:d]       # labels (ground truth), cropped with the same window
model.set_input(tmp)                           # hand the crop to the network as its input

3.2 网络

我的网络借鉴的主要思想：

1.不直接学习端到端的像素值，而是学习噪声(网络更容易拟合?)
2.使用通道可分离的卷积，适当增加通道数(显存太小，跑起来速度很慢)
3.尝试增加卷积核大小(显存太小，跑起来速度很慢)

（比赛有模型大小限制）——增加通道和卷积核会增加显存的使用，设备不好，所以只有增加通道数。具体实施细则如下：

纯纯的Unet baseline修改而来

class Unet2(nn.Module):
    """U-Net style network that predicts a correction added back onto its input.

    The encoder is ``first_block`` plus three ``Block2`` stages, each followed
    by a 2x2 max-pool; ``conv_mid`` is the bottleneck; the decoder mirrors the
    encoder with transposed convolutions and skip concatenations.  A final
    1x1 convolution maps back to ``dim`` channels.

    Args:
        dim: number of channels of both the input and the output tensor.
    """

    def __init__(self, dim=4):
        super().__init__()
        self.dims = [32, 64, 128, 256, 512]
        self.ks = [3, 3, 3, 3, 3]
        # Decoder walks the channel list backwards; the bottleneck kernel size
        # is dropped from the reversed kernel list.
        self.dims_up = self.dims[::-1]
        self.ks_up = self.ks[-2::-1]

        self.first_block = Block2(dim, self.dims[0], self.ks[0])
        # Original note (presumably the result when AvgPool2d was tried here):
        # PSNR 37.683, SSIM 0.902, score 30.679, time 52.650
        self.first_pool = nn.MaxPool2d(kernel_size=2)

        # Encoder stages: Block{idx} widens the channels, pool{idx} halves H and W.
        encoder_spec = zip(self.dims[:-2], self.dims[1:], self.ks[1:])
        for idx, (ch_in, ch_out, ksize) in enumerate(encoder_spec):
            setattr(self, f'Block{idx}', Block2(ch_in, ch_out, k=ksize))
            setattr(self, f'pool{idx}', nn.MaxPool2d(kernel_size=2))

        self.conv_mid = Block2(self.dims[-2], self.dims[-1], self.ks[-1])

        # Decoder stages: ConvTrans{idx} doubles H and W and halves the channels;
        # after concatenation with the skip tensor the channel count is ch_in
        # again, which up_Block{idx} reduces to ch_out.
        decoder_spec = zip(self.dims_up[:-1], self.dims_up[1:], self.ks_up)
        for idx, (ch_in, ch_out, ksize) in enumerate(decoder_spec):
            setattr(self, f'ConvTrans{idx}',
                    nn.ConvTranspose2d(ch_in, ch_out, 2, stride=2, bias=True))
            setattr(self, f'up_Block{idx}', Block2(ch_in, ch_out, k=ksize))

        self.last_conv = nn.Conv2d(self.dims[0], dim, 1, bias=True)

    def forward(self, x):
        """Run the network on a 4-D tensor ``x`` and return a tensor of the same shape.

        The input is replicate-padded on the bottom/right so that height and
        width become multiples of 32; the result is cropped back to the input
        size before the input is added to it.
        """
        _, _, height, width = x.shape
        # (-v) % 32 is 0 when v is already a multiple of 32, else 32 - v % 32.
        pad_bottom = (-height) % 32
        pad_right = (-width) % 32
        feat = F.pad(x, (0, pad_right, 0, pad_bottom), 'replicate')

        # Encoder: remember every pre-pool feature map for the skip connections.
        feat = self.first_block(feat)
        skips = [feat]
        feat = self.first_pool(feat)

        for idx in range(len(self.dims) - 2):
            feat = getattr(self, f'Block{idx}')(feat)
            skips.append(feat)
            feat = getattr(self, f'pool{idx}')(feat)

        feat = self.conv_mid(feat)

        # Decoder: the most recently stored skip is consumed first.
        for idx in range(len(self.dims_up) - 1):
            upsampled = getattr(self, f'ConvTrans{idx}')(feat)
            merged = torch.cat([upsampled, skips.pop()], 1)
            feat = getattr(self, f'up_Block{idx}')(merged)

        # Crop away the padding and add the original input.
        correction = self.last_conv(feat)
        return correction[:, :, :height, :width] + x


class Block2(nn.Module):
    """Two stacked k x k convolutions, each followed by a leaky ReLU.

    Args:
        dim_in: channels of the incoming tensor (the first convolution keeps
            this channel count).
        dim_out: channels produced by the second convolution.
        k: kernel size; zero padding of ``k // 2`` is applied on every side.
    """

    def __init__(self, dim_in, dim_out, k=3):
        super().__init__()
        conv_kwargs = dict(kernel_size=k, padding=k // 2, padding_mode='zeros', bias=True)
        self.conv1 = nn.Conv2d(dim_in, dim_in, **conv_kwargs)
        self.conv2 = nn.Conv2d(dim_in, dim_out, **conv_kwargs)

    def forward(self, x):
        """Apply conv1 -> leaky ReLU -> conv2 -> leaky ReLU."""
        for conv in (self.conv1, self.conv2):
            x = self.leaky_relu(conv(x))
        return x

    def leaky_relu(self, x, a=0.2):
        """Return the element-wise maximum of ``a * x`` and ``x``."""
        return torch.max(a * x, x)

我使用的网络：魔改 ConvNeXt

class Our(nn.Module):
    """U-Net shaped network built from ``Block`` units, predicting a correction
    that is added back onto the input.

    Layout: ``first_block`` plus two further ``Block`` stages, each followed by
    a 2x2 max-pool; ``conv_mid`` as bottleneck; three decoder stages made of a
    transposed convolution, a skip concatenation and a ``Block``.  The head is
    a LayerNorm followed by a Linear layer, both applied over the channel axis.

    Args:
        dim: number of channels of both the input and the output tensor.
    """

    def __init__(self, dim=4):
        super().__init__()
        self.dims = [128, 256, 512, 1024]
        self.ks = [3, 3, 3, 3]
        # A large-kernel variant (dims=[16, 32, 64, 128, 256],
        # ks=[23, 23, 23, 17, 3]) was left disabled by the author with the
        # remark that there was not enough memory for it.
        self.dims_up = self.dims[::-1]
        self.ks_up = self.ks[-2::-1]

        self.first_block = Block(dim, self.dims[0], self.ks[0])
        self.first_pool = nn.MaxPool2d(kernel_size=2)

        # Encoder stages: Block{idx} widens the channels, pool{idx} halves H and W.
        encoder_spec = zip(self.dims[:-2], self.dims[1:], self.ks[1:])
        for idx, (ch_in, ch_out, ksize) in enumerate(encoder_spec):
            setattr(self, f'Block{idx}', Block(ch_in, ch_out, k=ksize))
            setattr(self, f'pool{idx}', nn.MaxPool2d(kernel_size=2))

        self.conv_mid = Block(self.dims[-2], self.dims[-1], self.ks[-1])

        # Decoder stages: after the skip concatenation the channel count is
        # ch_in again, which up_Block{idx} reduces to ch_out.
        decoder_spec = zip(self.dims_up[:-1], self.dims_up[1:], self.ks_up)
        for idx, (ch_in, ch_out, ksize) in enumerate(decoder_spec):
            setattr(self, f'ConvTrans{idx}', nn.ConvTranspose2d(ch_in, ch_out, 2, stride=2))
            setattr(self, f'up_Block{idx}', Block(ch_in, ch_out, k=ksize))

        # Head operating on a channels-last layout (see forward).
        self.last_ln = nn.LayerNorm(self.dims[0], eps=1e-6)
        self.last_conv = nn.Linear(self.dims[0], dim)

    def forward(self, x):
        """Run the network on a 4-D tensor ``x`` and return a tensor of the same shape.

        The input is replicate-padded on the bottom/right so that height and
        width become multiples of 32; the result is cropped back to the input
        size before the input is added to it.
        """
        _, _, height, width = x.shape
        # (-v) % 32 is 0 when v is already a multiple of 32, else 32 - v % 32.
        pad_bottom = (-height) % 32
        pad_right = (-width) % 32
        feat = F.pad(x, (0, pad_right, 0, pad_bottom), 'replicate')

        # Encoder: remember every pre-pool feature map for the skip connections.
        feat = self.first_block(feat)
        skips = [feat]
        feat = self.first_pool(feat)

        for idx in range(len(self.dims) - 2):
            feat = getattr(self, f'Block{idx}')(feat)
            skips.append(feat)
            feat = getattr(self, f'pool{idx}')(feat)

        feat = self.conv_mid(feat)

        # Decoder: the most recently stored skip is consumed first.
        for idx in range(len(self.dims_up) - 1):
            upsampled = getattr(self, f'ConvTrans{idx}')(feat)
            merged = torch.cat([upsampled, skips.pop()], 1)
            feat = getattr(self, f'up_Block{idx}')(merged)

        # Head: move channels last so LayerNorm/Linear act per pixel over the
        # channel axis, then move channels back to axis 1.
        feat = feat.permute(0, 2, 3, 1).contiguous()
        feat = self.last_conv(self.last_ln(feat))
        feat = feat.permute(0, 3, 1, 2).contiguous()

        # Crop away the padding and add the original input.
        return feat[:, :, :height, :width] + x



class Block(nn.Module):
    """Depthwise-conv residual unit with a learnable scale on the main branch.

    Main branch: depthwise k x k convolution -> LayerNorm over channels ->
    Linear (dim_in -> 2 * dim_in) -> GELU -> Linear (2 * dim_in -> dim_out),
    multiplied by the scalar parameter ``w`` (initialised to zero).
    Shortcut branch: a 1x1 convolution mapping dim_in -> dim_out.

    Args:
        dim_in: channels of the incoming tensor.
        dim_out: channels of the returned tensor.
        k: depthwise kernel size; padding of ``k // 2`` is applied on every side.
    """

    def __init__(self, dim_in, dim_out, k=9):
        super().__init__()
        hidden = dim_in * 2
        # groups=dim_in makes the convolution depthwise (one filter per channel).
        self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=k, padding=k // 2, groups=dim_in)
        self.ln = nn.LayerNorm(dim_in, eps=1e-6)
        # The two Linear layers act on the channel axis of a channels-last
        # tensor, i.e. they play the role of 1x1 convolutions.
        self.conv1x1up = nn.Linear(dim_in, hidden)
        self.act = nn.GELU()
        self.conv1x1dn = nn.Linear(hidden, dim_out)
        # Scalar gate on the main branch; zero at initialisation, so the block
        # initially outputs only the shortcut.
        self.w = nn.Parameter(torch.zeros(1))
        # Shortcut projection so the channel counts match for the addition.
        self.res_conv = nn.Conv2d(dim_in, dim_out, 1)

    def forward(self, x):
        """Return ``w * main_branch(x) + res_conv(x)`` for a 4-D tensor ``x``."""
        branch = self.conv(x)
        # Channels-last layout for LayerNorm / Linear.
        branch = branch.permute(0, 2, 3, 1).contiguous()
        branch = self.conv1x1up(self.ln(branch))
        branch = self.conv1x1dn(self.act(branch))
        # Back to channels-first.
        branch = branch.permute(0, 3, 1, 2).contiguous()
        return branch * self.w + self.res_conv(x)

3.3 损失函数

# L1 loss criterion; the surrounding text reports it worked better than L2 / SSIM here.
loss = torch.nn.L1Loss()

实测了一下，还是L1效果好啊
其它L2、SSIM之类的花里胡哨的效果并不理想 (毕竟是炼丹，可能只是不适合我的网络)

3.4 传统滤波方法

哈、我还试了一下传统的去噪，顺便使用纯python写了一个双边滤波(参考我以前matlab的代码)，不得不说，还是深度学习yyds!

def bilateral_filter(img, r=20, sigma_space=15.0, sigma_color=10.0):
    """Apply a bilateral filter to an image of shape (height, width, channels).

    Each interior pixel is replaced by a weighted mean of its
    (2*r + 1) x (2*r + 1) neighbourhood, where a neighbour's weight is the
    product of a spatial Gaussian (distance to the centre pixel) and a range
    Gaussian (absolute intensity difference to the centre pixel, truncated to
    an integer and looked up in a 256-entry table).  Channels are filtered
    independently.

    Ported from the author's MATLAB implementation:
    https://blog.csdn.net/qq_38204686/article/details/106929922

    Args:
        img: array indexed as ``img[row, col, channel]``.  The range table has
            256 entries, so intensity differences are expected to stay within
            0..255 (e.g. 8-bit images).
        r: window radius; the kernel size is ``2*r + 1``.
        sigma_space: standard deviation of the spatial Gaussian.
        sigma_color: standard deviation of the range (intensity) Gaussian.

    Returns:
        A new array with the same shape and dtype as ``img``.  Pixels closer
        than ``r`` to any border are copied unchanged; for integer dtypes the
        filtered value is truncated on assignment.

    Raises:
        ValueError: if ``r`` is negative or a sigma is not positive.
    """
    if r < 0:
        raise ValueError("r must be non-negative")
    if sigma_space <= 0 or sigma_color <= 0:
        raise ValueError("sigma_space and sigma_color must be positive")

    # Spatial kernel centred on index (r, r): offsets run over -r..r inclusive.
    offsets = np.arange(-r, r + 1)
    dist_sq = offsets[:, None] ** 2 + offsets[None, :] ** 2
    w_space = np.exp(-dist_sq / (2.0 * sigma_space * sigma_space))

    # Range weights, indexed by the integer absolute intensity difference.
    levels = np.arange(256, dtype=np.float64)
    w_color = np.exp(-(levels * levels) / (2.0 * sigma_color * sigma_color))

    height, width, channel = img.shape
    dst_img = img.copy()
    # Work in float so differences of unsigned pixels cannot wrap around.
    work = img.astype(np.float64)

    for h in range(r, height - r):
        for w in range(r, width - r):
            for c in range(channel):
                p_c = work[h, w, c]                                # centre pixel
                p_win = work[h - r:h + r + 1, w - r:w + r + 1, c]  # window pixels
                diff = np.abs(p_win - p_c).astype(int)
                w_tmp = w_space * w_color[diff]
                dst_img[h, w, c] = np.sum(p_win * w_tmp) / np.sum(w_tmp)

    return dst_img

4. 主要参考链接

https://zhuanlan.zhihu.com/p/455913104 (ConvNeXt: A ConvNet for the 2020s)
https://zhuanlan.zhihu.com/p/349644858 (如何白嫖GPU)
https://blog.csdn.net/u011447962/article/details/123510680 (CVPR 2022 | RepLKNet)
https://github.com/gbstack/CVPR-2022-papers#SG (CVPR2022 Papers (Papers/Codes/Demos))

文章出处登录后可见！

已经登录？立即刷新

python 参加某图像去噪比赛有感