Commit bd86ed9
Parent(s): db5b5dc

here we go

Browse files
- .DS_Store +0 -0
- app.py +109 -0
- checkpoints/kittieigen_L.pth +3 -0
- checkpoints/nyu_L.pth +3 -0
- iebins/dataloaders/__init__.py +0 -0
- iebins/dataloaders/__pycache__/__init__.cpython-38.pyc +0 -0
- iebins/dataloaders/__pycache__/dataloader.cpython-38.pyc +0 -0
- iebins/dataloaders/__pycache__/dataloader_sun.cpython-38.pyc +0 -0
- iebins/dataloaders/dataloader.py +343 -0
- iebins/dataloaders/dataloader_sun.py +326 -0
- iebins/eval.py +177 -0
- iebins/eval_sun.py +179 -0
- iebins/inference_single_image.py +117 -0
- iebins/networks/NewCRFDepth.py +318 -0
- iebins/networks/__init__.py +0 -0
- iebins/networks/depth_update.py +39 -0
- iebins/networks/newcrf_layers.py +433 -0
- iebins/networks/newcrf_utils.py +264 -0
- iebins/networks/resize.py +51 -0
- iebins/networks/swin_transformer.py +620 -0
- iebins/networks/uper_crf_head.py +364 -0
- iebins/sum_depth.py +22 -0
- iebins/test.py +209 -0
- iebins/train.py +499 -0
- iebins/utils.py +356 -0
- iebins/utils/transfrom.py +250 -0
- requirements.txt +12 -0
    	
.DS_Store ADDED
Binary file (6.15 kB)
    	
app.py ADDED
@@ -0,0 +1,109 @@
import gradio as gr
import cv2
import numpy as np
import os
from PIL import Image
import spaces
import torch
import torch.nn.functional as F
from torchvision.transforms import Compose
import tempfile
from gradio_imageslider import ImageSlider

from iebins.networks.NewCRFDepth import NewCRFDepth
from iebins.utils.transfrom import Resize, NormalizeImage, PrepareForNet

css = """
#img-display-container {
    max-height: 100vh;
    }
#img-display-input {
    max-height: 80vh;
    }
#img-display-output {
    max-height: 80vh;
    }
"""
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model = NewCRFDepth(version="large07", inv_depth=False,
                    max_depth=10, pretrained=None).to(DEVICE).eval()
model.load_state_dict(torch.load('checkpoints/nyu_L.pth'))

title = "# IEBins: Iterative Elastic Bins for Monocular Depth Estimation"
description = """Demo for **IEBins: Iterative Elastic Bins for Monocular Depth Estimation**.
Please refer to the [paper](https://arxiv.org/abs/2309.14137), [github](https://github.com/ShuweiShao/IEBins), or [poster](https://nips.cc/media/PosterPDFs/NeurIPS%202023/70695.png?t=1701662442.5228624) for more details."""

transform = Compose([
    Resize(
        width=518,
        height=518,
        resize_target=False,
        keep_aspect_ratio=True,
        ensure_multiple_of=14,
        resize_method='lower_bound',
        image_interpolation_method=cv2.INTER_CUBIC,
    ),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])


@spaces.GPU
@torch.no_grad()
def predict_depth(model, image):
    return model(image)


with gr.Blocks(css=css) as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.Markdown("### Depth Prediction demo")
    gr.Markdown(
        "You can slide the output to compare the depth prediction with input image")

    with gr.Row():
        input_image = gr.Image(label="Input Image",
                               type='numpy', elem_id='img-display-input')
        depth_image_slider = ImageSlider(
            label="Depth Map with Slider View", elem_id='img-display-output', position=0.5,)
    raw_file = gr.File(
        label="16-bit raw depth (can be considered as disparity)")
    submit = gr.Button("Submit")

    def on_submit(image):
        original_image = image.copy()

        h, w = image.shape[:2]

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0
        image = transform({'image': image})['image']
        image = torch.from_numpy(image).unsqueeze(0).to(DEVICE)

        depth = predict_depth(model, image)
        depth = F.interpolate(depth[None], (h, w),
                              mode='bilinear', align_corners=False)[0, 0]

        raw_depth = Image.fromarray(depth.cpu().numpy().astype('uint16'))
        tmp = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
        raw_depth.save(tmp.name)

        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
        depth = depth.cpu().numpy().astype(np.uint8)
        colored_depth = cv2.applyColorMap(
            depth, cv2.COLORMAP_INFERNO)[:, :, ::-1]

        return [(original_image, colored_depth), tmp.name]

    submit.click(on_submit, inputs=[input_image], outputs=[
                 depth_image_slider, raw_file])

    example_files = os.listdir('examples')
    example_files.sort()
    example_files = [os.path.join('examples', filename)
                     for filename in example_files]
    examples = gr.Examples(examples=example_files, inputs=[input_image], outputs=[
                           depth_image_slider, raw_file], fn=on_submit, cache_examples=False)


if __name__ == '__main__':
    demo.queue().launch()
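
For debugging the Space outside Gradio, the preprocessing half of on_submit can be exercised on its own. The sketch below is not part of the commit: it assumes the repository root is on PYTHONPATH and feeds a synthetic image through the same Resize / NormalizeImage / PrepareForNet pipeline that app.py builds, producing the tensor that predict_depth would receive.

# Illustrative sketch only (not in the commit): mirrors the preprocessing steps of
# on_submit on a dummy image, assuming the repo root is on PYTHONPATH.
import cv2
import numpy as np
import torch
from torchvision.transforms import Compose

from iebins.utils.transfrom import Resize, NormalizeImage, PrepareForNet

transform = Compose([
    Resize(width=518, height=518, resize_target=False, keep_aspect_ratio=True,
           ensure_multiple_of=14, resize_method='lower_bound',
           image_interpolation_method=cv2.INTER_CUBIC),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

# Stand-in for cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 on an uploaded image.
rgb = np.random.rand(480, 640, 3).astype(np.float32)

x = torch.from_numpy(transform({'image': rgb})['image']).unsqueeze(0)
print(x.shape)  # expected (1, 3, H, W) with H and W rounded to multiples of 14

From there, on_submit moves the tensor to DEVICE, calls predict_depth, interpolates the result back to the input resolution, and writes both the colored map and the 16-bit PNG.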
    	
checkpoints/kittieigen_L.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bf10549a615b19b96ffdddc82e639662c421fe0cd30008cc3cf3e7d4bffa5f55
size 3276188594
    	
checkpoints/nyu_L.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:81d95d5f26f5d01b7e8b060467eef77ea6efea4ddf100d60f5fad87e6c0daae7
size 3276188594
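
Both .pth entries are Git LFS pointer files rather than the weights themselves: the repository stores only the spec version, the sha256 oid, and the byte size (roughly 3.3 GB each), and the actual checkpoints are fetched separately when the repo is pulled with LFS. A small sketch, not part of the commit, for checking that a fetched file matches the oid recorded above:

# Verify a pulled LFS object against the sha256 oid from its pointer file.
import hashlib

CHECKPOINT = 'checkpoints/nyu_L.pth'
EXPECTED_OID = '81d95d5f26f5d01b7e8b060467eef77ea6efea4ddf100d60f5fad87e6c0daae7'

sha = hashlib.sha256()
with open(CHECKPOINT, 'rb') as f:
    # Stream in 1 MiB chunks; the file is far too large to read at once.
    for chunk in iter(lambda: f.read(1 << 20), b''):
        sha.update(chunk)

if sha.hexdigest() != EXPECTED_OID:
    raise RuntimeError(f'{CHECKPOINT} is still an LFS pointer or is corrupted')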
    	
iebins/dataloaders/__init__.py ADDED
File without changes
    	
iebins/dataloaders/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (173 Bytes)
    	
iebins/dataloaders/__pycache__/dataloader.cpython-38.pyc ADDED
Binary file (9.15 kB)
    	
iebins/dataloaders/__pycache__/dataloader_sun.cpython-38.pyc ADDED
Binary file (8.93 kB)
    	
iebins/dataloaders/dataloader.py ADDED
@@ -0,0 +1,343 @@
import torch
from torch.utils.data import Dataset, DataLoader
import torch.utils.data.distributed
from torchvision import transforms

import numpy as np
from PIL import Image
import os
import random
import copy

from utils import DistributedSamplerNoEvenlyDivisible


def _is_pil_image(img):
    return isinstance(img, Image.Image)


def _is_numpy_image(img):
    return isinstance(img, np.ndarray) and (img.ndim in {2, 3})


def preprocessing_transforms(mode):
    return transforms.Compose([
        ToTensor(mode=mode)
    ])


class NewDataLoader(object):
    def __init__(self, args, mode):
        if mode == 'train':
            self.training_samples = DataLoadPreprocess(args, mode, transform=preprocessing_transforms(mode))
            if args.distributed:
                self.train_sampler = torch.utils.data.distributed.DistributedSampler(self.training_samples)
            else:
                self.train_sampler = None

            self.data = DataLoader(self.training_samples, args.batch_size,
                                   shuffle=(self.train_sampler is None),
                                   num_workers=args.num_threads,
                                   pin_memory=True,
                                   sampler=self.train_sampler)

        elif mode == 'online_eval':
            self.testing_samples = DataLoadPreprocess(args, mode, transform=preprocessing_transforms(mode))
            if args.distributed:
                # self.eval_sampler = torch.utils.data.distributed.DistributedSampler(self.testing_samples, shuffle=False)
                self.eval_sampler = DistributedSamplerNoEvenlyDivisible(self.testing_samples, shuffle=False)
            else:
                self.eval_sampler = None
            self.data = DataLoader(self.testing_samples, 1,
                                   shuffle=False,
                                   num_workers=1,
                                   pin_memory=True,
                                   sampler=self.eval_sampler)

        elif mode == 'test':
            self.testing_samples = DataLoadPreprocess(args, mode, transform=preprocessing_transforms(mode))
            self.data = DataLoader(self.testing_samples, 1, shuffle=False, num_workers=1)

        else:
            print('mode should be one of \'train, test, online_eval\'. Got {}'.format(mode))


class DataLoadPreprocess(Dataset):
    def __init__(self, args, mode, transform=None, is_for_online_eval=False):
        self.args = args
        if mode == 'online_eval':
            with open(args.filenames_file_eval, 'r') as f:
                self.filenames = f.readlines()
        else:
            with open(args.filenames_file, 'r') as f:
                self.filenames = f.readlines()

        self.mode = mode
        self.transform = transform
        self.to_tensor = ToTensor
        self.is_for_online_eval = is_for_online_eval

    def __getitem__(self, idx):
        sample_path = self.filenames[idx]
        # focal = float(sample_path.split()[2])
        focal = 518.8579

        if self.mode == 'train':
            if self.args.dataset == 'kitti':
                rgb_file = sample_path.split()[0]
                depth_file = os.path.join(sample_path.split()[0].split('/')[0], sample_path.split()[1])
                if self.args.use_right is True and random.random() > 0.5:
                    rgb_file = rgb_file.replace('image_02', 'image_03')
                    depth_file = depth_file.replace('image_02', 'image_03')
            else:
                rgb_file = sample_path.split()[0]
                depth_file = sample_path.split()[1]

            image_path = os.path.join(self.args.data_path, rgb_file)
            depth_path = os.path.join(self.args.gt_path, depth_file)

            image = Image.open(image_path)
            depth_gt = Image.open(depth_path)

            if self.args.do_kb_crop is True:
                height = image.height
                width = image.width
                top_margin = int(height - 352)
                left_margin = int((width - 1216) / 2)
                depth_gt = depth_gt.crop((left_margin, top_margin, left_margin + 1216, top_margin + 352))
                image = image.crop((left_margin, top_margin, left_margin + 1216, top_margin + 352))

            # To avoid blank boundaries due to pixel registration
            if self.args.dataset == 'nyu':
                if self.args.input_height == 480:
                    depth_gt = np.array(depth_gt)
                    valid_mask = np.zeros_like(depth_gt)
                    valid_mask[45:472, 43:608] = 1
                    depth_gt[valid_mask==0] = 0
                    depth_gt = Image.fromarray(depth_gt)
                else:
                    depth_gt = depth_gt.crop((43, 45, 608, 472))
                    image = image.crop((43, 45, 608, 472))

            if self.args.do_random_rotate is True:
                random_angle = (random.random() - 0.5) * 2 * self.args.degree
                image = self.rotate_image(image, random_angle)
                depth_gt = self.rotate_image(depth_gt, random_angle, flag=Image.NEAREST)

            image = np.asarray(image, dtype=np.float32) / 255.0
            depth_gt = np.asarray(depth_gt, dtype=np.float32)
            depth_gt = np.expand_dims(depth_gt, axis=2)

            if self.args.dataset == 'nyu':
                depth_gt = depth_gt / 1000.0
            else:
                depth_gt = depth_gt / 256.0

            if image.shape[0] != self.args.input_height or image.shape[1] != self.args.input_width:
                image, depth_gt = self.random_crop(image, depth_gt, self.args.input_height, self.args.input_width)
            image, depth_gt = self.train_preprocess(image, depth_gt)
            # https://github.com/ShuweiShao/URCDC-Depth
            image, depth_gt = self.Cut_Flip(image, depth_gt)
            sample = {'image': image, 'depth': depth_gt, 'focal': focal}

        else:
            if self.mode == 'online_eval':
                data_path = self.args.data_path_eval
            else:
                data_path = self.args.data_path

            image_path = os.path.join(data_path, "./" + sample_path.split()[0])
            image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0

            if self.mode == 'online_eval':
                gt_path = self.args.gt_path_eval
                depth_path = os.path.join(gt_path, "./" + sample_path.split()[1])
                if self.args.dataset == 'kitti':
                    depth_path = os.path.join(gt_path, sample_path.split()[0].split('/')[0], sample_path.split()[1])
                has_valid_depth = False
                try:
                    depth_gt = Image.open(depth_path)
                    has_valid_depth = True
                except IOError:
                    depth_gt = False
                    # print('Missing gt for {}'.format(image_path))

                if has_valid_depth:
                    depth_gt = np.asarray(depth_gt, dtype=np.float32)
                    depth_gt = np.expand_dims(depth_gt, axis=2)
                    if self.args.dataset == 'nyu':
                        depth_gt = depth_gt / 1000.0
                    else:
                        depth_gt = depth_gt / 256.0

            if self.args.do_kb_crop is True:
                height = image.shape[0]
                width = image.shape[1]
                top_margin = int(height - 352)
                left_margin = int((width - 1216) / 2)
                image = image[top_margin:top_margin + 352, left_margin:left_margin + 1216, :]
                if self.mode == 'online_eval' and has_valid_depth:
                    depth_gt = depth_gt[top_margin:top_margin + 352, left_margin:left_margin + 1216, :]

            if self.mode == 'online_eval':
                sample = {'image': image, 'depth': depth_gt, 'focal': focal, 'has_valid_depth': has_valid_depth}
            else:
                sample = {'image': image, 'focal': focal}

        if self.transform:
            sample = self.transform([sample, self.args.dataset])

        return sample

    def rotate_image(self, image, angle, flag=Image.BILINEAR):
        result = image.rotate(angle, resample=flag)
        return result

    def random_crop(self, img, depth, height, width):
        assert img.shape[0] >= height
        assert img.shape[1] >= width
        assert img.shape[0] == depth.shape[0]
        assert img.shape[1] == depth.shape[1]
        x = random.randint(0, img.shape[1] - width)
        y = random.randint(0, img.shape[0] - height)
        img = img[y:y + height, x:x + width, :]
        depth = depth[y:y + height, x:x + width, :]
        return img, depth

    def train_preprocess(self, image, depth_gt):
        # Random flipping
        do_flip = random.random()
        if do_flip > 0.5:
            image = (image[:, ::-1, :]).copy()
            depth_gt = (depth_gt[:, ::-1, :]).copy()

        # Random gamma, brightness, color augmentation
        do_augment = random.random()
        if do_augment > 0.5:
            image = self.augment_image(image)

        return image, depth_gt

    def augment_image(self, image):
        # gamma augmentation
        gamma = random.uniform(0.9, 1.1)
        image_aug = image ** gamma

        # brightness augmentation
        if self.args.dataset == 'nyu':
            brightness = random.uniform(0.75, 1.25)
        else:
            brightness = random.uniform(0.9, 1.1)
        image_aug = image_aug * brightness

        # color augmentation
        colors = np.random.uniform(0.9, 1.1, size=3)
        white = np.ones((image.shape[0], image.shape[1]))
        color_image = np.stack([white * colors[i] for i in range(3)], axis=2)
        image_aug *= color_image
        image_aug = np.clip(image_aug, 0, 1)

        return image_aug

    def Cut_Flip(self, image, depth):

        p = random.random()
        if p < 0.5:
            return image, depth
        image_copy = copy.deepcopy(image)
        depth_copy = copy.deepcopy(depth)
        h, w, c = image.shape

        N = 2
        h_list = []
        h_interval_list = []   # hight interval
        for i in range(N-1):
            h_list.append(random.randint(int(0.2*h), int(0.8*h)))
        h_list.append(h)
        h_list.append(0)
        h_list.sort()
        h_list_inv = np.array([h]*(N+1))-np.array(h_list)
        for i in range(len(h_list)-1):
            h_interval_list.append(h_list[i+1]-h_list[i])
        for i in range(N):
            image[h_list[i]:h_list[i+1], :, :] = image_copy[h_list_inv[i]-h_interval_list[i]:h_list_inv[i], :, :]
            depth[h_list[i]:h_list[i+1], :, :] = depth_copy[h_list_inv[i]-h_interval_list[i]:h_list_inv[i], :, :]

        return image, depth

    def __len__(self):
        return len(self.filenames)


class ToTensor(object):
    def __init__(self, mode):
        self.mode = mode
        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    def __call__(self, sample_dataset):

        sample = sample_dataset[0]
        dataset = sample_dataset[1]

        image, focal = sample['image'], sample['focal']
        image = self.to_tensor(image)
        image = self.normalize(image)

        if dataset == 'kitti':
            K_p = np.array([[716.88, 0, 596.5593, 0],
                  [0, 716.88, 149.854, 0],
                  [0, 0, 1, 0],
                  [0, 0, 0, 1]], dtype=np.float32)
            inv_K_p = np.linalg.pinv(K_p)
            inv_K_p = torch.from_numpy(inv_K_p)

        elif dataset == 'nyu':
            K_p = np.array([[518.8579, 0, 325.5824, 0],
                  [0, 518.8579, 253.7362, 0],
                  [0, 0, 1, 0],
                  [0, 0, 0, 1]], dtype=np.float32)
            inv_K_p = np.linalg.pinv(K_p)
            inv_K_p = torch.from_numpy(inv_K_p)

        if self.mode == 'test':
            return {'image': image, 'inv_K_p': inv_K_p, 'focal': focal}

        depth = sample['depth']
        if self.mode == 'train':
            depth = self.to_tensor(depth)
            return {'image': image, 'depth': depth, 'focal': focal}
        else:
            has_valid_depth = sample['has_valid_depth']
            return {'image': image, 'depth': depth, 'focal': focal, 'has_valid_depth': has_valid_depth}

    def to_tensor(self, pic):
        if not (_is_pil_image(pic) or _is_numpy_image(pic)):
            raise TypeError(
                'pic should be PIL Image or ndarray. Got {}'.format(type(pic)))

        if isinstance(pic, np.ndarray):
            img = torch.from_numpy(pic.transpose((2, 0, 1)))
            return img

        # handle PIL Image
        if pic.mode == 'I':
            img = torch.from_numpy(np.array(pic, np.int32, copy=False))
        elif pic.mode == 'I;16':
            img = torch.from_numpy(np.array(pic, np.int16, copy=False))
        else:
            img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
        # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
        if pic.mode == 'YCbCr':
            nchannel = 3
        elif pic.mode == 'I;16':
            nchannel = 1
        else:
            nchannel = len(pic.mode)
        img = img.view(pic.size[1], pic.size[0], nchannel)

        img = img.transpose(0, 1).transpose(0, 2).contiguous()
        if isinstance(img, torch.ByteTensor):
            return img.float()
        else:
            return img
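
The argument object this dataloader expects is built elsewhere (the training and eval scripts use argparse), but the fields it actually reads are visible above. The sketch below is hypothetical: SimpleNamespace stands in for the parsed arguments, the paths and split file are placeholders, and it assumes the iebins directory is the working directory so that `from utils import DistributedSamplerNoEvenlyDivisible` resolves.

# Hypothetical wiring of NewDataLoader for an NYU training run; all paths are placeholders.
from types import SimpleNamespace

from dataloaders.dataloader import NewDataLoader  # run from inside iebins/

args = SimpleNamespace(
    dataset='nyu',
    data_path='datasets/nyu/sync',                      # placeholder
    gt_path='datasets/nyu/sync',                        # placeholder
    filenames_file='data_splits/nyu_train_files.txt',   # placeholder
    input_height=480, input_width=640,
    do_kb_crop=False, do_random_rotate=True, degree=2.5,
    use_right=False,
    batch_size=8, num_threads=4, distributed=False,
)

loader = NewDataLoader(args, mode='train')
batch = next(iter(loader.data))
# ToTensor returns dicts: a 'train' batch carries 'image', 'depth' and 'focal'.
print(batch['image'].shape, batch['depth'].shape)  # expected [B, 3, 480, 640] and [B, 1, 480, 640]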
    	
        iebins/dataloaders/dataloader_sun.py
    ADDED
    
    | @@ -0,0 +1,326 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import torch
         | 
| 2 | 
            +
            from torch.utils.data import Dataset, DataLoader
         | 
| 3 | 
            +
            import torch.utils.data.distributed
         | 
| 4 | 
            +
            from torchvision import transforms
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            import numpy as np
         | 
| 7 | 
            +
            from PIL import Image
         | 
| 8 | 
            +
            import os
         | 
| 9 | 
            +
            import random
         | 
| 10 | 
            +
            import copy
         | 
| 11 | 
            +
            import cv2
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            from utils import DistributedSamplerNoEvenlyDivisible
         | 
| 14 | 
            +
             | 
| 15 | 
            +
             | 
| 16 | 
            +
            def _is_pil_image(img):
         | 
| 17 | 
            +
                return isinstance(img, Image.Image)
         | 
| 18 | 
            +
             | 
| 19 | 
            +
             | 
| 20 | 
            +
            def _is_numpy_image(img):
         | 
| 21 | 
            +
                return isinstance(img, np.ndarray) and (img.ndim in {2, 3})
         | 
| 22 | 
            +
             | 
| 23 | 
            +
             | 
| 24 | 
            +
            def preprocessing_transforms(mode):
         | 
| 25 | 
            +
                return transforms.Compose([
         | 
| 26 | 
            +
                    ToTensor(mode=mode)
         | 
| 27 | 
            +
                ])
         | 
| 28 | 
            +
             | 
| 29 | 
            +
             | 
| 30 | 
            +
            class NewDataLoader(object):
         | 
| 31 | 
            +
                def __init__(self, args, mode):
         | 
| 32 | 
            +
                    if mode == 'train':
         | 
| 33 | 
            +
                        self.training_samples = DataLoadPreprocess(args, mode, transform=preprocessing_transforms(mode))
         | 
| 34 | 
            +
                        if args.distributed:
         | 
| 35 | 
            +
                            self.train_sampler = torch.utils.data.distributed.DistributedSampler(self.training_samples)
         | 
| 36 | 
            +
                        else:
         | 
| 37 | 
            +
                            self.train_sampler = None
         | 
| 38 | 
            +
                
         | 
| 39 | 
            +
                        self.data = DataLoader(self.training_samples, args.batch_size,
         | 
| 40 | 
            +
                                               shuffle=(self.train_sampler is None),
         | 
| 41 | 
            +
                                               num_workers=args.num_threads,
         | 
| 42 | 
            +
                                               pin_memory=True,
         | 
| 43 | 
            +
                                               sampler=self.train_sampler)
         | 
| 44 | 
            +
             | 
| 45 | 
            +
                    elif mode == 'online_eval':
         | 
| 46 | 
            +
                        self.testing_samples = DataLoadPreprocess(args, mode, transform=preprocessing_transforms(mode))
         | 
| 47 | 
            +
                        if args.distributed:
         | 
| 48 | 
            +
                            # self.eval_sampler = torch.utils.data.distributed.DistributedSampler(self.testing_samples, shuffle=False)
         | 
| 49 | 
            +
                            self.eval_sampler = DistributedSamplerNoEvenlyDivisible(self.testing_samples, shuffle=False)
         | 
| 50 | 
            +
                        else:
         | 
| 51 | 
            +
                            self.eval_sampler = None
         | 
| 52 | 
            +
                        self.data = DataLoader(self.testing_samples, 1,
         | 
| 53 | 
            +
                                               shuffle=False,
         | 
| 54 | 
            +
                                               num_workers=1,
         | 
| 55 | 
            +
                                               pin_memory=True,
         | 
| 56 | 
            +
                                               sampler=self.eval_sampler)
         | 
| 57 | 
            +
                    
         | 
| 58 | 
            +
                    elif mode == 'test':
         | 
| 59 | 
            +
                        self.testing_samples = DataLoadPreprocess(args, mode, transform=preprocessing_transforms(mode))
         | 
| 60 | 
            +
                        self.data = DataLoader(self.testing_samples, 1, shuffle=False, num_workers=1)
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                    else:
         | 
| 63 | 
            +
                        print('mode should be one of \'train, test, online_eval\'. Got {}'.format(mode))
         | 
| 64 | 
            +
                        
         | 
| 65 | 
            +
                        
         | 
| 66 | 
            +
            class DataLoadPreprocess(Dataset):
         | 
| 67 | 
            +
                def __init__(self, args, mode, transform=None, is_for_online_eval=False):
         | 
| 68 | 
            +
                    self.args = args
         | 
| 69 | 
            +
                    if mode == 'online_eval':
         | 
| 70 | 
            +
                        with open(args.filenames_file_eval, 'r') as f:
         | 
| 71 | 
            +
                            self.filenames = f.readlines()
         | 
| 72 | 
            +
                    else:
         | 
| 73 | 
            +
                        with open(args.filenames_file, 'r') as f:
         | 
| 74 | 
            +
                            self.filenames = f.readlines()
         | 
| 75 | 
            +
                
         | 
| 76 | 
            +
                    self.mode = mode
         | 
| 77 | 
            +
                    self.transform = transform
         | 
| 78 | 
            +
                    self.to_tensor = ToTensor
         | 
| 79 | 
            +
                    self.is_for_online_eval = is_for_online_eval
         | 
| 80 | 
            +
                
         | 
| 81 | 
            +
                def __getitem__(self, idx):
         | 
| 82 | 
            +
                    sample_path = self.filenames[idx]
         | 
| 83 | 
            +
                    # focal = float(sample_path.split()[2])
         | 
| 84 | 
            +
                    focal = 518.8579
         | 
| 85 | 
            +
             | 
| 86 | 
            +
                    if self.mode == 'train':
         | 
| 87 | 
            +
                        if self.args.dataset == 'kitti':
         | 
| 88 | 
            +
                            rgb_file = sample_path.split()[0]
         | 
| 89 | 
            +
                            depth_file = os.path.join(sample_path.split()[0].split('/')[0], sample_path.split()[1])
         | 
| 90 | 
            +
                            if self.args.use_right is True and random.random() > 0.5:
         | 
| 91 | 
            +
                                rgb_file = rgb_file.replace('image_02', 'image_03')
         | 
| 92 | 
            +
                                depth_file = depth_file.replace('image_02', 'image_03')
         | 
| 93 | 
            +
                        else:
         | 
| 94 | 
            +
                            rgb_file = sample_path.split()[0]
         | 
| 95 | 
            +
                            depth_file = sample_path.split()[1]
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                        image_path = os.path.join(self.args.data_path, rgb_file)
         | 
| 98 | 
            +
                        depth_path = os.path.join(self.args.gt_path, depth_file)
         | 
| 99 | 
            +
                
         | 
| 100 | 
            +
                        image = Image.open(image_path)
         | 
| 101 | 
            +
                        depth_gt = Image.open(depth_path)
         | 
| 102 | 
            +
                        
         | 
| 103 | 
            +
                        if self.args.do_kb_crop is True:
         | 
| 104 | 
            +
                            height = image.height
         | 
| 105 | 
            +
                            width = image.width
         | 
| 106 | 
            +
                            top_margin = int(height - 352)
         | 
| 107 | 
            +
                            left_margin = int((width - 1216) / 2)
         | 
| 108 | 
            +
                            depth_gt = depth_gt.crop((left_margin, top_margin, left_margin + 1216, top_margin + 352))
         | 
| 109 | 
            +
                            image = image.crop((left_margin, top_margin, left_margin + 1216, top_margin + 352))
         | 
| 110 | 
            +
                        
         | 
| 111 | 
            +
                        # To avoid blank boundaries due to pixel registration
         | 
| 112 | 
            +
                        if self.args.dataset == 'nyu':
         | 
| 113 | 
            +
                            if self.args.input_height == 480:
         | 
| 114 | 
            +
                                depth_gt = np.array(depth_gt)
         | 
| 115 | 
            +
                                valid_mask = np.zeros_like(depth_gt)
         | 
| 116 | 
            +
                                valid_mask[45:472, 43:608] = 1
         | 
| 117 | 
            +
                                depth_gt[valid_mask==0] = 0
         | 
| 118 | 
            +
                                depth_gt = Image.fromarray(depth_gt)
         | 
| 119 | 
            +
                            else:
         | 
| 120 | 
            +
                                depth_gt = depth_gt.crop((43, 45, 608, 472))
         | 
| 121 | 
            +
                                image = image.crop((43, 45, 608, 472))
         | 
| 122 | 
            +
                
         | 
| 123 | 
            +
                        if self.args.do_random_rotate is True:
         | 
| 124 | 
            +
                            random_angle = (random.random() - 0.5) * 2 * self.args.degree
         | 
| 125 | 
            +
                            image = self.rotate_image(image, random_angle)
         | 
| 126 | 
            +
                            depth_gt = self.rotate_image(depth_gt, random_angle, flag=Image.NEAREST)
         | 
| 127 | 
            +
                        
         | 
| 128 | 
            +
                        image = np.asarray(image, dtype=np.float32) / 255.0
         | 
| 129 | 
            +
                        depth_gt = np.asarray(depth_gt, dtype=np.float32)
         | 
| 130 | 
            +
                        depth_gt = np.expand_dims(depth_gt, axis=2)
         | 
| 131 | 
            +
             | 
| 132 | 
            +
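                        # Depth pngs are stored as integers: NYU uses millimetres (divide by
                        # 1000 to get metres), KITTI uses uint16 values scaled by 256.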
                        if self.args.dataset == 'nyu':
         | 
| 133 | 
            +
                            depth_gt = depth_gt / 1000.0
         | 
| 134 | 
            +
                        else:
         | 
| 135 | 
            +
                            depth_gt = depth_gt / 256.0
         | 
| 136 | 
            +
             | 
| 137 | 
            +
                        if image.shape[0] != self.args.input_height or image.shape[1] != self.args.input_width:
         | 
| 138 | 
            +
                            image, depth_gt = self.random_crop(image, depth_gt, self.args.input_height, self.args.input_width)
         | 
| 139 | 
            +
                        image, depth_gt = self.train_preprocess(image, depth_gt)
         | 
| 140 | 
            +
                        image, depth_gt = self.Cut_Flip(image, depth_gt)
         | 
| 141 | 
            +
                        sample = {'image': image, 'depth': depth_gt, 'focal': focal}
         | 
| 142 | 
            +
                    
         | 
| 143 | 
            +
                    else:
         | 
| 144 | 
            +
                        if self.mode == 'online_eval':
         | 
| 145 | 
            +
                            data_path = self.args.data_path_eval
         | 
| 146 | 
            +
                        else:
         | 
| 147 | 
            +
                            data_path = self.args.data_path
         | 
| 148 | 
            +
             | 
| 149 | 
            +
                        image_path = os.path.join(data_path, "./" + sample_path.split()[0])
         | 
| 150 | 
            +
                        image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0
         | 
| 151 | 
            +
                        image = cv2.resize(image, (640, 480))
         | 
| 152 | 
            +
             | 
| 153 | 
            +
                        if self.mode == 'online_eval':
         | 
| 154 | 
            +
                            gt_path = self.args.gt_path_eval
         | 
| 155 | 
            +
                            depth_path = os.path.join(gt_path, "./" + sample_path.split()[1])
         | 
| 156 | 
            +
                            if self.args.dataset == 'kitti':
         | 
| 157 | 
            +
                                depth_path = os.path.join(gt_path, sample_path.split()[0].split('/')[0], sample_path.split()[1])
         | 
| 158 | 
            +
                            has_valid_depth = False
         | 
| 159 | 
            +
                            try:
         | 
| 160 | 
            +
                                depth_gt = Image.open(depth_path)
         | 
| 161 | 
            +
                                has_valid_depth = True
         | 
| 162 | 
            +
                            except IOError:
         | 
| 163 | 
            +
                                depth_gt = False
         | 
| 164 | 
            +
                                # print('Missing gt for {}'.format(image_path))
         | 
| 165 | 
            +
             | 
| 166 | 
            +
                            if has_valid_depth:
         | 
| 167 | 
            +
                            depth_gt = np.asarray(depth_gt, dtype=np.uint16)  # read the 16-bit depth png
         | 
| 168 | 
            +
                            depth_gt = np.bitwise_or(np.right_shift(depth_gt, 3), np.left_shift(depth_gt, 16 - 3))  # circular bit-shift by 3 (raw NYU/SUN RGB-D depth encoding)
         | 
| 169 | 
            +
                                depth_gt = np.expand_dims(depth_gt, axis=2)
         | 
| 170 | 
            +
                                if self.args.dataset == 'nyu':
         | 
| 171 | 
            +
                                depth_gt = depth_gt.astype(np.single) / 1000  # millimetres to metres
         | 
| 172 | 
            +
                                depth_gt = depth_gt.astype(np.float32)  # ensure float32
         | 
| 173 | 
            +
                                else:
         | 
| 174 | 
            +
                                    depth_gt = depth_gt / 256.0
         | 
| 175 | 
            +
             | 
| 176 | 
            +
                        if self.args.do_kb_crop is True:
         | 
| 177 | 
            +
                            height = image.shape[0]
         | 
| 178 | 
            +
                            width = image.shape[1]
         | 
| 179 | 
            +
                            top_margin = int(height - 352)
         | 
| 180 | 
            +
                            left_margin = int((width - 1216) / 2)
         | 
| 181 | 
            +
                            image = image[top_margin:top_margin + 352, left_margin:left_margin + 1216, :]
         | 
| 182 | 
            +
                            if self.mode == 'online_eval' and has_valid_depth:
         | 
| 183 | 
            +
                                depth_gt = depth_gt[top_margin:top_margin + 352, left_margin:left_margin + 1216, :]
         | 
| 184 | 
            +
                        
         | 
| 185 | 
            +
                        if self.mode == 'online_eval':
         | 
| 186 | 
            +
                            sample = {'image': image, 'depth': depth_gt, 'focal': focal, 'has_valid_depth': has_valid_depth}
         | 
| 187 | 
            +
                        else:
         | 
| 188 | 
            +
                            sample = {'image': image, 'focal': focal}
         | 
| 189 | 
            +
                    
         | 
| 190 | 
            +
                    if self.transform:
         | 
| 191 | 
            +
                        sample = self.transform(sample)
         | 
| 192 | 
            +
                    
         | 
| 193 | 
            +
                    return sample
         | 
| 194 | 
            +
                
         | 
| 195 | 
            +
                def rotate_image(self, image, angle, flag=Image.BILINEAR):
         | 
| 196 | 
            +
                    result = image.rotate(angle, resample=flag)
         | 
| 197 | 
            +
                    return result
         | 
| 198 | 
            +
             | 
| 199 | 
            +
                def random_crop(self, img, depth, height, width):
         | 
| 200 | 
            +
                    assert img.shape[0] >= height
         | 
| 201 | 
            +
                    assert img.shape[1] >= width
         | 
| 202 | 
            +
                    assert img.shape[0] == depth.shape[0]
         | 
| 203 | 
            +
                    assert img.shape[1] == depth.shape[1]
         | 
| 204 | 
            +
                    x = random.randint(0, img.shape[1] - width)
         | 
| 205 | 
            +
                    y = random.randint(0, img.shape[0] - height)
         | 
| 206 | 
            +
                    img = img[y:y + height, x:x + width, :]
         | 
| 207 | 
            +
                    depth = depth[y:y + height, x:x + width, :]
         | 
| 208 | 
            +
                    return img, depth
         | 
| 209 | 
            +
             | 
| 210 | 
            +
                def train_preprocess(self, image, depth_gt):
         | 
| 211 | 
            +
                    # Random flipping
         | 
| 212 | 
            +
                    do_flip = random.random()
         | 
| 213 | 
            +
                    if do_flip > 0.5:
         | 
| 214 | 
            +
                        image = (image[:, ::-1, :]).copy()
         | 
| 215 | 
            +
                        depth_gt = (depth_gt[:, ::-1, :]).copy()
         | 
| 216 | 
            +
                
         | 
| 217 | 
            +
                    # Random gamma, brightness, color augmentation
         | 
| 218 | 
            +
                    do_augment = random.random()
         | 
| 219 | 
            +
                    if do_augment > 0.5:
         | 
| 220 | 
            +
                        image = self.augment_image(image)
         | 
| 221 | 
            +
                
         | 
| 222 | 
            +
                    return image, depth_gt
         | 
| 223 | 
            +
                
         | 
| 224 | 
            +
                def augment_image(self, image):
         | 
| 225 | 
            +
                    # gamma augmentation
         | 
| 226 | 
            +
                    gamma = random.uniform(0.9, 1.1)
         | 
| 227 | 
            +
                    image_aug = image ** gamma
         | 
| 228 | 
            +
             | 
| 229 | 
            +
                    # brightness augmentation
         | 
| 230 | 
            +
                    if self.args.dataset == 'nyu':
         | 
| 231 | 
            +
                        brightness = random.uniform(0.75, 1.25)
         | 
| 232 | 
            +
                    else:
         | 
| 233 | 
            +
                        brightness = random.uniform(0.9, 1.1)
         | 
| 234 | 
            +
                    image_aug = image_aug * brightness
         | 
| 235 | 
            +
             | 
| 236 | 
            +
                    # color augmentation
         | 
| 237 | 
            +
                    colors = np.random.uniform(0.9, 1.1, size=3)
         | 
| 238 | 
            +
                    white = np.ones((image.shape[0], image.shape[1]))
         | 
| 239 | 
            +
                    color_image = np.stack([white * colors[i] for i in range(3)], axis=2)
         | 
| 240 | 
            +
                    image_aug *= color_image
         | 
| 241 | 
            +
                    image_aug = np.clip(image_aug, 0, 1)
         | 
| 242 | 
            +
             | 
| 243 | 
            +
                    return image_aug
         | 
| 244 | 
            +
                
         | 
| 245 | 
            +
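                # CutFlip-style augmentation: with probability 0.5 the image is cut at a
                # random height (between 0.2*h and 0.8*h) into two horizontal strips whose
                # vertical order is swapped; the depth map is shuffled identically so that
                # image/depth alignment is preserved.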
                def Cut_Flip(self, image, depth):
         | 
| 246 | 
            +
             | 
| 247 | 
            +
                    p = random.random()
         | 
| 248 | 
            +
                    if p < 0.5:
         | 
| 249 | 
            +
                        return image, depth
         | 
| 250 | 
            +
                    image_copy = copy.deepcopy(image)
         | 
| 251 | 
            +
                    depth_copy = copy.deepcopy(depth)
         | 
| 252 | 
            +
                    h, w, c = image.shape
         | 
| 253 | 
            +
             | 
| 254 | 
            +
                    N = 2  # number of horizontal strips to swap
         | 
| 255 | 
            +
                    h_list = []
         | 
| 256 | 
            +
                    h_interval_list = []   # height intervals between consecutive cut points
         | 
| 257 | 
            +
                    for i in range(N-1):
         | 
| 258 | 
            +
                        h_list.append(random.randint(int(0.2*h), int(0.8*h)))
         | 
| 259 | 
            +
                    h_list.append(h)
         | 
| 260 | 
            +
                    h_list.append(0)  
         | 
| 261 | 
            +
                    h_list.sort()
         | 
| 262 | 
            +
                    h_list_inv = np.array([h]*(N+1))-np.array(h_list)
         | 
| 263 | 
            +
                    for i in range(len(h_list)-1):
         | 
| 264 | 
            +
                        h_interval_list.append(h_list[i+1]-h_list[i])
         | 
| 265 | 
            +
                    for i in range(N):
         | 
| 266 | 
            +
                        image[h_list[i]:h_list[i+1], :, :] = image_copy[h_list_inv[i]-h_interval_list[i]:h_list_inv[i], :, :]
         | 
| 267 | 
            +
                        depth[h_list[i]:h_list[i+1], :, :] = depth_copy[h_list_inv[i]-h_interval_list[i]:h_list_inv[i], :, :]
         | 
| 268 | 
            +
             | 
| 269 | 
            +
                    return image, depth
         | 
| 270 | 
            +
             | 
| 271 | 
            +
                
         | 
| 272 | 
            +
                def __len__(self):
         | 
| 273 | 
            +
                    return len(self.filenames)
         | 
| 274 | 
            +
             | 
| 275 | 
            +
             | 
| 276 | 
            +
            class ToTensor(object):
         | 
| 277 | 
            +
                def __init__(self, mode):
         | 
| 278 | 
            +
                    self.mode = mode
         | 
| 279 | 
            +
                    self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
         | 
| 280 | 
            +
                
         | 
| 281 | 
            +
                def __call__(self, sample):
         | 
| 282 | 
            +
                    image, focal = sample['image'], sample['focal']
         | 
| 283 | 
            +
                    image = self.to_tensor(image)
         | 
| 284 | 
            +
                    image = self.normalize(image)
         | 
| 285 | 
            +
             | 
| 286 | 
            +
                    if self.mode == 'test':
         | 
| 287 | 
            +
                        return {'image': image, 'focal': focal}
         | 
| 288 | 
            +
             | 
| 289 | 
            +
                    depth = sample['depth']
         | 
| 290 | 
            +
                    if self.mode == 'train':
         | 
| 291 | 
            +
                        depth = self.to_tensor(depth)
         | 
| 292 | 
            +
                        return {'image': image, 'depth': depth, 'focal': focal}
         | 
| 293 | 
            +
                    else:
         | 
| 294 | 
            +
                        has_valid_depth = sample['has_valid_depth']
         | 
| 295 | 
            +
                        return {'image': image, 'depth': depth, 'focal': focal, 'has_valid_depth': has_valid_depth}
         | 
| 296 | 
            +
                
         | 
| 297 | 
            +
                def to_tensor(self, pic):
         | 
| 298 | 
            +
                    if not (_is_pil_image(pic) or _is_numpy_image(pic)):
         | 
| 299 | 
            +
                        raise TypeError(
         | 
| 300 | 
            +
                            'pic should be PIL Image or ndarray. Got {}'.format(type(pic)))
         | 
| 301 | 
            +
                    
         | 
| 302 | 
            +
                    if isinstance(pic, np.ndarray):
         | 
| 303 | 
            +
                        img = torch.from_numpy(pic.transpose((2, 0, 1)))
         | 
| 304 | 
            +
                        return img
         | 
| 305 | 
            +
                    
         | 
| 306 | 
            +
                    # handle PIL Image
         | 
| 307 | 
            +
                    if pic.mode == 'I':
         | 
| 308 | 
            +
                        img = torch.from_numpy(np.array(pic, np.int32, copy=False))
         | 
| 309 | 
            +
                    elif pic.mode == 'I;16':
         | 
| 310 | 
            +
                        img = torch.from_numpy(np.array(pic, np.int16, copy=False))
         | 
| 311 | 
            +
                    else:
         | 
| 312 | 
            +
                        img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
         | 
| 313 | 
            +
                    # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
         | 
| 314 | 
            +
                    if pic.mode == 'YCbCr':
         | 
| 315 | 
            +
                        nchannel = 3
         | 
| 316 | 
            +
                    elif pic.mode == 'I;16':
         | 
| 317 | 
            +
                        nchannel = 1
         | 
| 318 | 
            +
                    else:
         | 
| 319 | 
            +
                        nchannel = len(pic.mode)
         | 
| 320 | 
            +
                    img = img.view(pic.size[1], pic.size[0], nchannel)
         | 
| 321 | 
            +
                    
         | 
| 322 | 
            +
                    img = img.transpose(0, 1).transpose(0, 2).contiguous()
         | 
| 323 | 
            +
                    if isinstance(img, torch.ByteTensor):
         | 
| 324 | 
            +
                        return img.float()
         | 
| 325 | 
            +
                    else:
         | 
| 326 | 
            +
                        return img
         | 
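A quick sanity check for the ToTensor transform above. This is a sketch only: it assumes the repository root is on PYTHONPATH so that iebins is importable as a package, and the array shapes and focal value are illustrative placeholders rather than values taken from the repository's filename lists.

import numpy as np
from iebins.dataloaders.dataloader import ToTensor  # assumes the package layout added in this commit

to_tensor = ToTensor(mode='train')
sample = {
    'image': np.random.rand(480, 640, 3).astype(np.float32),  # H x W x 3, values in [0, 1]
    'depth': np.random.rand(480, 640, 1).astype(np.float32),  # H x W x 1, metres
    'focal': 518.8579,                                         # placeholder focal length
}
out = to_tensor(sample)
print(out['image'].shape)  # torch.Size([3, 480, 640]) after transpose + ImageNet normalisation
print(out['depth'].shape)  # torch.Size([1, 480, 640])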
    	
        iebins/eval.py
    ADDED
    
    | @@ -0,0 +1,177 @@ | |
| 1 | 
            +
            import torch
         | 
| 2 | 
            +
            import torch.backends.cudnn as cudnn
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import os, sys
         | 
| 5 | 
            +
            import argparse
         | 
| 6 | 
            +
            import numpy as np
         | 
| 7 | 
            +
            from tqdm import tqdm
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            from utils import post_process_depth, flip_lr, compute_errors
         | 
| 10 | 
            +
            from networks.NewCRFDepth import NewCRFDepth
         | 
| 11 | 
            +
             | 
| 12 | 
            +
             | 
| 13 | 
            +
            def convert_arg_line_to_args(arg_line):
         | 
| 14 | 
            +
                for arg in arg_line.split():
         | 
| 15 | 
            +
                    if not arg.strip():
         | 
| 16 | 
            +
                        continue
         | 
| 17 | 
            +
                    yield arg
         | 
| 18 | 
            +
             | 
| 19 | 
            +
             | 
| 20 | 
            +
            parser = argparse.ArgumentParser(description='IEBins PyTorch implementation.', fromfile_prefix_chars='@')
         | 
| 21 | 
            +
            parser.convert_arg_line_to_args = convert_arg_line_to_args
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            parser.add_argument('--model_name',                type=str,   help='model name', default='iebins')
         | 
| 24 | 
            +
            parser.add_argument('--encoder',                   type=str,   help='type of encoder, base07, large07, tiny07', default='large07')
         | 
| 25 | 
            +
            parser.add_argument('--checkpoint_path',           type=str,   help='path to a checkpoint to load', default='')
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            # Dataset
         | 
| 28 | 
            +
            parser.add_argument('--dataset',                   type=str,   help='dataset to train on, kitti or nyu', default='nyu')
         | 
| 29 | 
            +
            parser.add_argument('--input_height',              type=int,   help='input height', default=480)
         | 
| 30 | 
            +
            parser.add_argument('--input_width',               type=int,   help='input width',  default=640)
         | 
| 31 | 
            +
            parser.add_argument('--max_depth',                 type=float, help='maximum depth in estimation', default=10)
         | 
| 32 | 
            +
             | 
| 33 | 
            +
            # Preprocessing
         | 
| 34 | 
            +
            parser.add_argument('--do_random_rotate',                      help='if set, will perform random rotation for augmentation', action='store_true')
         | 
| 35 | 
            +
            parser.add_argument('--degree',                    type=float, help='random rotation maximum degree', default=2.5)
         | 
| 36 | 
            +
            parser.add_argument('--do_kb_crop',                            help='if set, crop input images as kitti benchmark images', action='store_true')
         | 
| 37 | 
            +
            parser.add_argument('--use_right',                             help='if set, will randomly use right images when train on KITTI', action='store_true')
         | 
| 38 | 
            +
             | 
| 39 | 
            +
            # Eval
         | 
| 40 | 
            +
            parser.add_argument('--data_path_eval',            type=str,   help='path to the data for evaluation', required=False)
         | 
| 41 | 
            +
            parser.add_argument('--gt_path_eval',              type=str,   help='path to the groundtruth data for evaluation', required=False)
         | 
| 42 | 
            +
            parser.add_argument('--filenames_file_eval',       type=str,   help='path to the filenames text file for evaluation', required=False)
         | 
| 43 | 
            +
            parser.add_argument('--min_depth_eval',            type=float, help='minimum depth for evaluation', default=1e-3)
         | 
| 44 | 
            +
            parser.add_argument('--max_depth_eval',            type=float, help='maximum depth for evaluation', default=80)
         | 
| 45 | 
            +
            parser.add_argument('--eigen_crop',                            help='if set, crops according to Eigen NIPS14', action='store_true')
         | 
| 46 | 
            +
            parser.add_argument('--garg_crop',                             help='if set, crops according to Garg  ECCV16', action='store_true')
         | 
| 47 | 
            +
             | 
| 48 | 
            +
             | 
| 49 | 
            +
            if sys.argv.__len__() == 2:
         | 
| 50 | 
            +
                arg_filename_with_prefix = '@' + sys.argv[1]
         | 
| 51 | 
            +
                args = parser.parse_args([arg_filename_with_prefix])
         | 
| 52 | 
            +
            else:
         | 
| 53 | 
            +
                args = parser.parse_args()
         | 
| 54 | 
            +
             | 
| 55 | 
            +
            if args.dataset == 'kitti' or args.dataset == 'nyu':
         | 
| 56 | 
            +
                from dataloaders.dataloader import NewDataLoader
         | 
| 57 | 
            +
             | 
| 58 | 
            +
             | 
| 59 | 
            +
            def eval(model, dataloader_eval, post_process=False):
         | 
| 60 | 
            +
                eval_measures = torch.zeros(10).cuda()
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                for _, eval_sample_batched in enumerate(tqdm(dataloader_eval.data)):
         | 
| 63 | 
            +
                    with torch.no_grad():
         | 
| 64 | 
            +
                        image = torch.autograd.Variable(eval_sample_batched['image'].cuda())
         | 
| 65 | 
            +
                        gt_depth = eval_sample_batched['depth']
         | 
| 66 | 
            +
                        has_valid_depth = eval_sample_batched['has_valid_depth']
         | 
| 67 | 
            +
                        if not has_valid_depth:
         | 
| 68 | 
            +
                            # print('Invalid depth. continue.')
         | 
| 69 | 
            +
                            continue
         | 
| 70 | 
            +
             | 
| 71 | 
            +
                        pred_depths_r_list, _, _ = model(image)
         | 
| 72 | 
            +
                        if post_process:
         | 
| 73 | 
            +
                            image_flipped = flip_lr(image)
         | 
| 74 | 
            +
                            pred_depths_r_list_flipped, _, _ = model(image_flipped)
         | 
| 75 | 
            +
                        pred_depth = post_process_depth(pred_depths_r_list[-1], pred_depths_r_list_flipped[-1])
                    else:
                        # no flip post-processing: use the final refined prediction directly
                        pred_depth = pred_depths_r_list[-1]
         | 
| 76 | 
            +
             | 
| 77 | 
            +
                        pred_depth = pred_depth.cpu().numpy().squeeze()
         | 
| 78 | 
            +
                        gt_depth = gt_depth.cpu().numpy().squeeze()     
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                    if args.do_kb_crop:
         | 
| 81 | 
            +
                        height, width = gt_depth.shape
         | 
| 82 | 
            +
                        top_margin = int(height - 352)
         | 
| 83 | 
            +
                        left_margin = int((width - 1216) / 2)
         | 
| 84 | 
            +
                        pred_depth_uncropped = np.zeros((height, width), dtype=np.float32)
         | 
| 85 | 
            +
                        pred_depth_uncropped[top_margin:top_margin + 352, left_margin:left_margin + 1216] = pred_depth
         | 
| 86 | 
            +
                        pred_depth = pred_depth_uncropped
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                    pred_depth[pred_depth < args.min_depth_eval] = args.min_depth_eval
         | 
| 89 | 
            +
                    pred_depth[pred_depth > args.max_depth_eval] = args.max_depth_eval
         | 
| 90 | 
            +
                    pred_depth[np.isinf(pred_depth)] = args.max_depth_eval
         | 
| 91 | 
            +
                    pred_depth[np.isnan(pred_depth)] = args.min_depth_eval
         | 
| 92 | 
            +
             | 
| 93 | 
            +
                    valid_mask = np.logical_and(gt_depth > args.min_depth_eval, gt_depth < args.max_depth_eval)
         | 
| 94 | 
            +
             | 
| 95 | 
            +
                    if args.garg_crop or args.eigen_crop:
         | 
| 96 | 
            +
                        gt_height, gt_width = gt_depth.shape
         | 
| 97 | 
            +
                        eval_mask = np.zeros(valid_mask.shape)
         | 
| 98 | 
            +
             | 
| 99 | 
            +
                        if args.garg_crop:
         | 
| 100 | 
            +
                            eval_mask[int(0.40810811 * gt_height):int(0.99189189 * gt_height), int(0.03594771 * gt_width):int(0.96405229 * gt_width)] = 1
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                        elif args.eigen_crop:
         | 
| 103 | 
            +
                            if args.dataset == 'kitti':
         | 
| 104 | 
            +
                                eval_mask[int(0.3324324 * gt_height):int(0.91351351 * gt_height), int(0.0359477 * gt_width):int(0.96405229 * gt_width)] = 1
         | 
| 105 | 
            +
                            elif args.dataset == 'nyu':
         | 
| 106 | 
            +
                                eval_mask[45:471, 41:601] = 1
         | 
| 107 | 
            +
             | 
| 108 | 
            +
                        valid_mask = np.logical_and(valid_mask, eval_mask)
         | 
| 109 | 
            +
             | 
| 110 | 
            +
                    measures = compute_errors(gt_depth[valid_mask], pred_depth[valid_mask])
         | 
| 111 | 
            +
             | 
| 112 | 
            +
                    eval_measures[:9] += torch.tensor(measures).cuda()
         | 
| 113 | 
            +
                    eval_measures[9] += 1
         | 
| 114 | 
            +
             | 
| 115 | 
            +
                eval_measures_cpu = eval_measures.cpu()
         | 
| 116 | 
            +
                cnt = eval_measures_cpu[9].item()
         | 
| 117 | 
            +
                eval_measures_cpu /= cnt
         | 
| 118 | 
            +
                print('Computing errors for {} eval samples'.format(int(cnt)), ', post_process: ', post_process)
         | 
| 119 | 
            +
                print("{:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}".format('silog', 'abs_rel', 'log10', 'rms',
         | 
| 120 | 
            +
                                                                                                'sq_rel', 'log_rms', 'd1', 'd2',
         | 
| 121 | 
            +
                                                                                                'd3'))
         | 
| 122 | 
            +
                for i in range(8):
         | 
| 123 | 
            +
                    print('{:7.4f}, '.format(eval_measures_cpu[i]), end='')
         | 
| 124 | 
            +
                print('{:7.4f}'.format(eval_measures_cpu[8]))
         | 
| 125 | 
            +
                return eval_measures_cpu
         | 
| 126 | 
            +
             | 
| 127 | 
            +
             | 
| 128 | 
            +
            def main_worker(args):
         | 
| 129 | 
            +
             | 
| 130 | 
            +
                # CRF model
         | 
| 131 | 
            +
                model = NewCRFDepth(version=args.encoder, inv_depth=False, max_depth=args.max_depth, pretrained=None)
         | 
| 132 | 
            +
                model.train()
         | 
| 133 | 
            +
             | 
| 134 | 
            +
                num_params = sum([np.prod(p.size()) for p in model.parameters()])
         | 
| 135 | 
            +
                print("== Total number of parameters: {}".format(num_params))
         | 
| 136 | 
            +
             | 
| 137 | 
            +
                num_params_update = sum([np.prod(p.shape) for p in model.parameters() if p.requires_grad])
         | 
| 138 | 
            +
                print("== Total number of learning parameters: {}".format(num_params_update))
         | 
| 139 | 
            +
             | 
| 140 | 
            +
                model = torch.nn.DataParallel(model)
         | 
| 141 | 
            +
                model.cuda()
         | 
| 142 | 
            +
             | 
| 143 | 
            +
                print("== Model Initialized")
         | 
| 144 | 
            +
             | 
| 145 | 
            +
                if args.checkpoint_path != '':
         | 
| 146 | 
            +
                    if os.path.isfile(args.checkpoint_path):
         | 
| 147 | 
            +
                        print("== Loading checkpoint '{}'".format(args.checkpoint_path))
         | 
| 148 | 
            +
                        checkpoint = torch.load(args.checkpoint_path, map_location='cpu')
         | 
| 149 | 
            +
                        model.load_state_dict(checkpoint['model'])
         | 
| 150 | 
            +
                        print("== Loaded checkpoint '{}'".format(args.checkpoint_path))
         | 
| 151 | 
            +
                        del checkpoint
         | 
| 152 | 
            +
                    else:
         | 
| 153 | 
            +
                        print("== No checkpoint found at '{}'".format(args.checkpoint_path))
         | 
| 154 | 
            +
             | 
| 155 | 
            +
                cudnn.benchmark = True
         | 
| 156 | 
            +
             | 
| 157 | 
            +
                dataloader_eval = NewDataLoader(args, 'online_eval')
         | 
| 158 | 
            +
             | 
| 159 | 
            +
                # ===== Evaluation =====
         | 
| 160 | 
            +
                model.eval()
         | 
| 161 | 
            +
                with torch.no_grad():
         | 
| 162 | 
            +
                    eval_measures = eval(model, dataloader_eval, post_process=True)
         | 
| 163 | 
            +
             | 
| 164 | 
            +
             | 
| 165 | 
            +
            def main():
         | 
| 166 | 
            +
                torch.cuda.empty_cache()
         | 
| 167 | 
            +
                args.distributed = False
         | 
| 168 | 
            +
                ngpus_per_node = torch.cuda.device_count()
         | 
| 169 | 
            +
                if ngpus_per_node > 1:
         | 
| 170 | 
            +
                    print("This machine has more than 1 gpu. Please set \'CUDA_VISIBLE_DEVICES=0\'")
         | 
| 171 | 
            +
                    return -1
         | 
| 172 | 
            +
                
         | 
| 173 | 
            +
                main_worker(args)
         | 
| 174 | 
            +
             | 
| 175 | 
            +
             | 
| 176 | 
            +
            if __name__ == '__main__':
         | 
| 177 | 
            +
                main()
         | 
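eval.py accumulates the nine error measures returned by compute_errors, which is imported from utils and not shown in this commit view. For reference, the sketch below reproduces the standard monocular-depth metric definitions in the same order the script prints them; the project's own compute_errors in iebins/utils.py remains the authoritative implementation.

import numpy as np

def compute_errors_sketch(gt, pred):
    # threshold accuracies d1/d2/d3
    thresh = np.maximum(gt / pred, pred / gt)
    d1 = (thresh < 1.25).mean()
    d2 = (thresh < 1.25 ** 2).mean()
    d3 = (thresh < 1.25 ** 3).mean()

    # rms and log-rms
    rms = np.sqrt(((gt - pred) ** 2).mean())
    log_rms = np.sqrt(((np.log(gt) - np.log(pred)) ** 2).mean())

    # relative errors
    abs_rel = np.mean(np.abs(gt - pred) / gt)
    sq_rel = np.mean(((gt - pred) ** 2) / gt)

    # scale-invariant log error and mean log10 error
    err = np.log(pred) - np.log(gt)
    silog = np.sqrt(np.mean(err ** 2) - np.mean(err) ** 2) * 100
    log10 = np.mean(np.abs(np.log10(gt) - np.log10(pred)))

    # order matches eval.py's printed header: silog, abs_rel, log10, rms, sq_rel, log_rms, d1, d2, d3
    return [silog, abs_rel, log10, rms, sq_rel, log_rms, d1, d2, d3]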
    	
        iebins/eval_sun.py
    ADDED
    
    | @@ -0,0 +1,179 @@ | |
| 1 | 
            +
            import torch
         | 
| 2 | 
            +
            import torch.backends.cudnn as cudnn
         | 
| 3 | 
            +
            import torch.nn.functional as F
         | 
| 4 | 
            +
            import os, sys
         | 
| 5 | 
            +
            import argparse
         | 
| 6 | 
            +
            import numpy as np
         | 
| 7 | 
            +
            from tqdm import tqdm
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            from utils import post_process_depth, flip_lr, compute_errors
         | 
| 10 | 
            +
            from networks.NewCRFDepth import NewCRFDepth
         | 
| 11 | 
            +
             | 
| 12 | 
            +
             | 
| 13 | 
            +
            def convert_arg_line_to_args(arg_line):
         | 
| 14 | 
            +
                for arg in arg_line.split():
         | 
| 15 | 
            +
                    if not arg.strip():
         | 
| 16 | 
            +
                        continue
         | 
| 17 | 
            +
                    yield arg
         | 
| 18 | 
            +
             | 
| 19 | 
            +
             | 
| 20 | 
            +
            parser = argparse.ArgumentParser(description='IEBins PyTorch implementation.', fromfile_prefix_chars='@')
         | 
| 21 | 
            +
            parser.convert_arg_line_to_args = convert_arg_line_to_args
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            parser.add_argument('--model_name',                type=str,   help='model name', default='iebins')
         | 
| 24 | 
            +
            parser.add_argument('--encoder',                   type=str,   help='type of encoder, base07, large07, tiny07', default='large07')
         | 
| 25 | 
            +
            parser.add_argument('--checkpoint_path',           type=str,   help='path to a checkpoint to load', default='')
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            # Dataset
         | 
| 28 | 
            +
            parser.add_argument('--dataset',                   type=str,   help='dataset to train on, kitti or nyu', default='nyu')
         | 
| 29 | 
            +
            parser.add_argument('--input_height',              type=int,   help='input height', default=480)
         | 
| 30 | 
            +
            parser.add_argument('--input_width',               type=int,   help='input width',  default=640)
         | 
| 31 | 
            +
            parser.add_argument('--max_depth',                 type=float, help='maximum depth in estimation', default=10)
         | 
| 32 | 
            +
             | 
| 33 | 
            +
            # Preprocessing
         | 
| 34 | 
            +
            parser.add_argument('--do_random_rotate',                      help='if set, will perform random rotation for augmentation', action='store_true')
         | 
| 35 | 
            +
            parser.add_argument('--degree',                    type=float, help='random rotation maximum degree', default=2.5)
         | 
| 36 | 
            +
            parser.add_argument('--do_kb_crop',                            help='if set, crop input images as kitti benchmark images', action='store_true')
         | 
| 37 | 
            +
            parser.add_argument('--use_right',                             help='if set, will randomly use right images when train on KITTI', action='store_true')
         | 
| 38 | 
            +
             | 
| 39 | 
            +
            # Eval
         | 
| 40 | 
            +
            parser.add_argument('--data_path_eval',            type=str,   help='path to the data for evaluation', required=False)
         | 
| 41 | 
            +
            parser.add_argument('--gt_path_eval',              type=str,   help='path to the groundtruth data for evaluation', required=False)
         | 
| 42 | 
            +
            parser.add_argument('--filenames_file_eval',       type=str,   help='path to the filenames text file for evaluation', required=False)
         | 
| 43 | 
            +
            parser.add_argument('--min_depth_eval',            type=float, help='minimum depth for evaluation', default=1e-3)
         | 
| 44 | 
            +
            parser.add_argument('--max_depth_eval',            type=float, help='maximum depth for evaluation', default=80)
         | 
| 45 | 
            +
            parser.add_argument('--eigen_crop',                            help='if set, crops according to Eigen NIPS14', action='store_true')
         | 
| 46 | 
            +
            parser.add_argument('--garg_crop',                             help='if set, crops according to Garg  ECCV16', action='store_true')
         | 
| 47 | 
            +
             | 
| 48 | 
            +
             | 
| 49 | 
            +
            if sys.argv.__len__() == 2:
         | 
| 50 | 
            +
                arg_filename_with_prefix = '@' + sys.argv[1]
         | 
| 51 | 
            +
                args = parser.parse_args([arg_filename_with_prefix])
         | 
| 52 | 
            +
            else:
         | 
| 53 | 
            +
                args = parser.parse_args()
         | 
| 54 | 
            +
             | 
| 55 | 
            +
            if args.dataset == 'nyu':
         | 
| 56 | 
            +
                from dataloaders.dataloader_sun import NewDataLoader
         | 
| 57 | 
            +
             | 
| 58 | 
            +
             | 
| 59 | 
            +
            def eval(model, dataloader_eval, post_process=False):
         | 
| 60 | 
            +
                eval_measures = torch.zeros(10).cuda()
         | 
| 61 | 
            +
                for _, eval_sample_batched in enumerate(tqdm(dataloader_eval.data)):
         | 
| 62 | 
            +
                    with torch.no_grad():
         | 
| 63 | 
            +
                        image = torch.autograd.Variable(eval_sample_batched['image'].cuda())
         | 
| 64 | 
            +
                        gt_depth = eval_sample_batched['depth']
         | 
| 65 | 
            +
                        has_valid_depth = eval_sample_batched['has_valid_depth']
         | 
| 66 | 
            +
                        if not has_valid_depth:
         | 
| 67 | 
            +
                            # print('Invalid depth. continue.')
         | 
| 68 | 
            +
                            continue
         | 
| 69 | 
            +
                        _, hh, ww, _ = gt_depth.shape
         | 
| 70 | 
            +
                        pred_depths_r_list, _, _ = model(image)
         | 
| 71 | 
            +
                        if post_process:
         | 
| 72 | 
            +
                            image_flipped = flip_lr(image)
         | 
| 73 | 
            +
                            pred_depths_r_list_flipped, _, _ = model(image_flipped)
         | 
| 74 | 
            +
                            pred_depth = post_process_depth(pred_depths_r_list[-1], pred_depths_r_list_flipped[-1])
         | 
| 75 | 
            +
                        pred_depth = F.interpolate(pred_depth, [hh, ww], mode="bilinear", align_corners=False)
                    else:
                        # no flip post-processing: resize the final refined prediction directly
                        pred_depth = F.interpolate(pred_depths_r_list[-1], [hh, ww], mode="bilinear", align_corners=False)
         | 
| 76 | 
            +
             | 
| 77 | 
            +
                        pred_depth = pred_depth.cpu().numpy().squeeze()
         | 
| 78 | 
            +
                        gt_depth = gt_depth.cpu().numpy().squeeze()
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                    if args.do_kb_crop:
         | 
| 81 | 
            +
                        height, width = gt_depth.shape
         | 
| 82 | 
            +
                        top_margin = int(height - 352)
         | 
| 83 | 
            +
                        left_margin = int((width - 1216) / 2)
         | 
| 84 | 
            +
                        pred_depth_uncropped = np.zeros((height, width), dtype=np.float32)
         | 
| 85 | 
            +
                        pred_depth_uncropped[top_margin:top_margin + 352, left_margin:left_margin + 1216] = pred_depth
         | 
| 86 | 
            +
                        pred_depth = pred_depth_uncropped
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                    pred_depth[pred_depth < args.min_depth_eval] = args.min_depth_eval
         | 
| 89 | 
            +
                    pred_depth[pred_depth > args.max_depth_eval] = args.max_depth_eval
         | 
| 90 | 
            +
                    pred_depth[np.isinf(pred_depth)] = args.max_depth_eval
         | 
| 91 | 
            +
                    pred_depth[np.isnan(pred_depth)] = args.min_depth_eval
         | 
| 92 | 
            +
                    pred_depth[pred_depth > 8] = 8
         | 
| 93 | 
            +
                    gt_depth[gt_depth > 8] = 8
         | 
| 94 | 
            +
             | 
| 95 | 
            +
                    valid_mask = np.logical_and(gt_depth > args.min_depth_eval, gt_depth < args.max_depth_eval)
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                    if args.garg_crop or args.eigen_crop:
         | 
| 98 | 
            +
                        gt_height, gt_width = gt_depth.shape
         | 
| 99 | 
            +
                        eval_mask = np.zeros(valid_mask.shape)
         | 
| 100 | 
            +
             | 
| 101 | 
            +
                        if args.garg_crop:
         | 
| 102 | 
            +
                            eval_mask[int(0.40810811 * gt_height):int(0.99189189 * gt_height), int(0.03594771 * gt_width):int(0.96405229 * gt_width)] = 1
         | 
| 103 | 
            +
             | 
| 104 | 
            +
                        elif args.eigen_crop:
         | 
| 105 | 
            +
                            if args.dataset == 'kitti':
         | 
| 106 | 
            +
                                eval_mask[int(0.3324324 * gt_height):int(0.91351351 * gt_height), int(0.0359477 * gt_width):int(0.96405229 * gt_width)] = 1
         | 
| 107 | 
            +
                            elif args.dataset == 'nyu':
         | 
| 108 | 
            +
                                eval_mask[45:471, 41:601] = 1
         | 
| 109 | 
            +
             | 
| 110 | 
            +
                        valid_mask = np.logical_and(valid_mask, eval_mask)
         | 
| 111 | 
            +
             | 
| 112 | 
            +
                    measures = compute_errors(gt_depth[valid_mask], pred_depth[valid_mask])
         | 
| 113 | 
            +
             | 
| 114 | 
            +
                    eval_measures[:9] += torch.tensor(measures).cuda()
         | 
| 115 | 
            +
                    eval_measures[9] += 1
         | 
| 116 | 
            +
             | 
| 117 | 
            +
                eval_measures_cpu = eval_measures.cpu()
         | 
| 118 | 
            +
                cnt = eval_measures_cpu[9].item()
         | 
| 119 | 
            +
                eval_measures_cpu /= cnt
         | 
| 120 | 
            +
                print('Computing errors for {} eval samples'.format(int(cnt)), ', post_process: ', post_process)
         | 
| 121 | 
            +
                print("{:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}".format('silog', 'abs_rel', 'log10', 'rms',
         | 
| 122 | 
            +
                                                                                                'sq_rel', 'log_rms', 'd1', 'd2',
         | 
| 123 | 
            +
                                                                                                'd3'))
         | 
| 124 | 
            +
                for i in range(8):
         | 
| 125 | 
            +
                    print('{:7.4f}, '.format(eval_measures_cpu[i]), end='')
         | 
| 126 | 
            +
                print('{:7.4f}'.format(eval_measures_cpu[8]))
         | 
| 127 | 
            +
                return eval_measures_cpu
         | 
| 128 | 
            +
             | 
| 129 | 
            +
             | 
| 130 | 
            +
            def main_worker(args):
         | 
| 131 | 
            +
             | 
| 132 | 
            +
                # CRF model
         | 
| 133 | 
            +
                model = NewCRFDepth(version=args.encoder, inv_depth=False, max_depth=args.max_depth, pretrained=None)
         | 
| 134 | 
            +
                model.train()
         | 
| 135 | 
            +
             | 
| 136 | 
            +
                num_params = sum([np.prod(p.size()) for p in model.parameters()])
         | 
| 137 | 
            +
                print("== Total number of parameters: {}".format(num_params))
         | 
| 138 | 
            +
             | 
| 139 | 
            +
                num_params_update = sum([np.prod(p.shape) for p in model.parameters() if p.requires_grad])
         | 
| 140 | 
            +
                print("== Total number of learning parameters: {}".format(num_params_update))
         | 
| 141 | 
            +
             | 
| 142 | 
            +
                model = torch.nn.DataParallel(model)
         | 
| 143 | 
            +
                model.cuda()
         | 
| 144 | 
            +
             | 
| 145 | 
            +
                print("== Model Initialized")
         | 
| 146 | 
            +
             | 
| 147 | 
            +
                if args.checkpoint_path != '':
         | 
| 148 | 
            +
                    if os.path.isfile(args.checkpoint_path):
         | 
| 149 | 
            +
                        print("== Loading checkpoint '{}'".format(args.checkpoint_path))
         | 
| 150 | 
            +
                        checkpoint = torch.load(args.checkpoint_path, map_location='cpu')
         | 
| 151 | 
            +
                        model.load_state_dict(checkpoint['model'])
         | 
| 152 | 
            +
                        print("== Loaded checkpoint '{}'".format(args.checkpoint_path))
         | 
| 153 | 
            +
                        del checkpoint
         | 
| 154 | 
            +
                    else:
         | 
| 155 | 
            +
                        print("== No checkpoint found at '{}'".format(args.checkpoint_path))
         | 
| 156 | 
            +
             | 
| 157 | 
            +
                cudnn.benchmark = True
         | 
| 158 | 
            +
             | 
| 159 | 
            +
                dataloader_eval = NewDataLoader(args, 'online_eval')
         | 
| 160 | 
            +
             | 
| 161 | 
            +
                # ===== Evaluation =====
         | 
| 162 | 
            +
                model.eval()
         | 
| 163 | 
            +
                with torch.no_grad():
         | 
| 164 | 
            +
                    eval_measures = eval(model, dataloader_eval, post_process=True)
         | 
| 165 | 
            +
             | 
| 166 | 
            +
             | 
| 167 | 
            +
            def main():
         | 
| 168 | 
            +
                torch.cuda.empty_cache()
         | 
| 169 | 
            +
                args.distributed = False
         | 
| 170 | 
            +
                ngpus_per_node = torch.cuda.device_count()
         | 
| 171 | 
            +
                if ngpus_per_node > 1:
         | 
| 172 | 
            +
                    print("This machine has more than 1 gpu. Please set \'CUDA_VISIBLE_DEVICES=0\'")
         | 
| 173 | 
            +
                    return -1
         | 
| 174 | 
            +
                
         | 
| 175 | 
            +
                main_worker(args)
         | 
| 176 | 
            +
             | 
| 177 | 
            +
             | 
| 178 | 
            +
            if __name__ == '__main__':
         | 
| 179 | 
            +
                main()
         | 
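Both evaluation scripts and the single-image inference script rely on flip_lr and post_process_depth from utils, which are not shown in this commit view. The sketch below only illustrates the core idea of flip-averaged inference; the real post_process_depth in iebins/utils.py may additionally blend the left/right image borders.

import torch

def flip_lr_sketch(x: torch.Tensor) -> torch.Tensor:
    # x: [B, C, H, W]; mirror along the width axis
    return torch.flip(x, dims=[3])

def post_process_depth_sketch(depth: torch.Tensor, depth_flipped: torch.Tensor) -> torch.Tensor:
    # average the direct prediction with the re-mirrored prediction of the flipped input
    return 0.5 * (depth + flip_lr_sketch(depth_flipped))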
    	
        iebins/inference_single_image.py
    ADDED
    
    | @@ -0,0 +1,117 @@ | |
| 1 | 
            +
            import torch
         | 
| 2 | 
            +
            import torch.backends.cudnn as cudnn
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import os, sys
         | 
| 5 | 
            +
            import argparse
         | 
| 6 | 
            +
            import numpy as np
         | 
| 7 | 
            +
            from tqdm import tqdm
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            from utils import post_process_depth, flip_lr, compute_errors
         | 
| 10 | 
            +
            from networks.NewCRFDepth import NewCRFDepth
         | 
| 11 | 
            +
            from PIL import Image 
         | 
| 12 | 
            +
            from torchvision import transforms
         | 
| 13 | 
            +
            import matplotlib.pyplot as plt
         | 
| 14 | 
            +
             | 
| 15 | 
            +
             | 
| 16 | 
            +
            def convert_arg_line_to_args(arg_line):
         | 
| 17 | 
            +
                for arg in arg_line.split():
         | 
| 18 | 
            +
                    if not arg.strip():
         | 
| 19 | 
            +
                        continue
         | 
| 20 | 
            +
                    yield arg
         | 
| 21 | 
            +
             | 
| 22 | 
            +
             | 
| 23 | 
            +
            parser = argparse.ArgumentParser(description='IEBins PyTorch implementation.', fromfile_prefix_chars='@')
         | 
| 24 | 
            +
            parser.convert_arg_line_to_args = convert_arg_line_to_args
         | 
| 25 | 
            +
             | 
| 26 | 
            +
            parser.add_argument('--model_name',                type=str,   help='model name', default='iebins')
         | 
| 27 | 
            +
            parser.add_argument('--encoder',                   type=str,   help='type of encoder, base07, large07', default='large07')
         | 
| 28 | 
            +
            parser.add_argument('--checkpoint_path',           type=str,   help='path to a checkpoint to load', default='')
         | 
| 29 | 
            +
            parser.add_argument('--dataset',                   type=str,   help='dataset to train on, kitti or nyu', default='nyu')
         | 
| 30 | 
            +
            parser.add_argument('--image_path',                type=str,   help='path to the image for inference', required=False)
         | 
| 31 | 
            +
            parser.add_argument('--max_depth',                 type=float, help='maximum depth in estimation', default=10)
         | 
| 32 | 
            +
             | 
| 33 | 
            +
             | 
| 34 | 
            +
            if sys.argv.__len__() == 2:
         | 
| 35 | 
            +
                arg_filename_with_prefix = '@' + sys.argv[1]
         | 
| 36 | 
            +
                args = parser.parse_args([arg_filename_with_prefix])
         | 
| 37 | 
            +
            else:
         | 
| 38 | 
            +
                args = parser.parse_args()
         | 
| 39 | 
            +
             | 
| 40 | 
            +
             | 
| 41 | 
            +
            def inference(model, post_process=False):
         | 
| 42 | 
            +
                
         | 
| 43 | 
            +
                image = np.asarray(Image.open(args.image_path), dtype=np.float32) / 255.0
         | 
| 44 | 
            +
             | 
| 45 | 
            +
                if args.dataset == 'kitti':
         | 
| 46 | 
            +
                    height = image.shape[0]
         | 
| 47 | 
            +
                    width = image.shape[1]
         | 
| 48 | 
            +
                    top_margin = int(height - 352)
         | 
| 49 | 
            +
                    left_margin = int((width - 1216) / 2)
         | 
| 50 | 
            +
                    image = image[top_margin:top_margin + 352, left_margin:left_margin + 1216, :]
         | 
| 51 | 
            +
             | 
| 52 | 
            +
                image = torch.from_numpy(image.transpose((2, 0, 1)))
         | 
| 53 | 
            +
                image = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])(image)
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                with torch.no_grad():
         | 
| 56 | 
            +
        image = image.unsqueeze(0).cuda()  # torch.autograd.Variable is deprecated; tensors work directly
         | 
| 57 | 
            +
                   
         | 
| 58 | 
            +
                    pred_depths_r_list, _, _ = model(image)
         | 
| 59 | 
            +
                    if post_process:
         | 
| 60 | 
            +
                        image_flipped = flip_lr(image)
         | 
| 61 | 
            +
                        pred_depths_r_list_flipped, _, _ = model(image_flipped)
         | 
| 62 | 
            +
                        pred_depth = post_process_depth(pred_depths_r_list[-1], pred_depths_r_list_flipped[-1])
         | 
| 63 | 
            +
             | 
| 64 | 
            +
        pred_depth = (pred_depth if post_process else pred_depths_r_list[-1]).cpu().numpy().squeeze()
         | 
| 65 | 
            +
             | 
| 66 | 
            +
                    if args.dataset == 'kitti':
         | 
| 67 | 
            +
                        plt.imsave('depth.png', np.log10(pred_depth), cmap='magma')
         | 
| 68 | 
            +
                    else:
         | 
| 69 | 
            +
                        plt.imsave('depth.png', pred_depth, cmap='jet')
         | 
| 70 | 
            +
                        
         | 
| 71 | 
            +
                      
         | 
| 72 | 
            +
            def main_worker(args):
         | 
| 73 | 
            +
             | 
| 74 | 
            +
                model = NewCRFDepth(version=args.encoder, inv_depth=False, max_depth=args.max_depth, pretrained=None)
         | 
| 75 | 
            +
                model.train()
         | 
| 76 | 
            +
             | 
| 77 | 
            +
                num_params = sum([np.prod(p.size()) for p in model.parameters()])
         | 
| 78 | 
            +
                print("== Total number of parameters: {}".format(num_params))
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                num_params_update = sum([np.prod(p.shape) for p in model.parameters() if p.requires_grad])
         | 
| 81 | 
            +
                print("== Total number of learning parameters: {}".format(num_params_update))
         | 
| 82 | 
            +
             | 
| 83 | 
            +
                model = torch.nn.DataParallel(model)
         | 
| 84 | 
            +
                model.cuda()
         | 
| 85 | 
            +
             | 
| 86 | 
            +
                print("== Model Initialized")
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                if args.checkpoint_path != '':
         | 
| 89 | 
            +
                    if os.path.isfile(args.checkpoint_path):
         | 
| 90 | 
            +
                        checkpoint = torch.load(args.checkpoint_path, map_location='cpu')
         | 
| 91 | 
            +
                        model.load_state_dict(checkpoint['model'])
         | 
| 92 | 
            +
                        print("== Loaded checkpoint '{}'".format(args.checkpoint_path))
         | 
| 93 | 
            +
                        del checkpoint
         | 
| 94 | 
            +
                    else:
         | 
| 95 | 
            +
                        print("== No checkpoint found at '{}'".format(args.checkpoint_path))
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                cudnn.benchmark = True
         | 
| 98 | 
            +
             | 
| 99 | 
            +
                # ===== Inference ======
         | 
| 100 | 
            +
                model.eval()
         | 
| 101 | 
            +
                with torch.no_grad():
         | 
| 102 | 
            +
                    inference(model, post_process=True)
         | 
| 103 | 
            +
             | 
| 104 | 
            +
             | 
| 105 | 
            +
            def main():
         | 
| 106 | 
            +
                torch.cuda.empty_cache()
         | 
| 107 | 
            +
                args.distributed = False
         | 
| 108 | 
            +
                ngpus_per_node = torch.cuda.device_count()
         | 
| 109 | 
            +
                if ngpus_per_node > 1:
         | 
| 110 | 
            +
                    print("This machine has more than 1 gpu. Please set \'CUDA_VISIBLE_DEVICES=0\'")
         | 
| 111 | 
            +
                    return -1
         | 
| 112 | 
            +
                
         | 
| 113 | 
            +
                main_worker(args)
         | 
| 114 | 
            +
             | 
| 115 | 
            +
             | 
| 116 | 
            +
            if __name__ == '__main__':
         | 
| 117 | 
            +
                main()
         | 
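A minimal end-to-end sketch (not part of this commit) of the same inference path driven directly from Python rather than through argparse. The checkpoint and image paths are assumptions, and the import assumes the script is run from inside iebins/ as above:

    import numpy as np
    import torch
    from PIL import Image
    from torchvision import transforms
    from networks.NewCRFDepth import NewCRFDepth

    # Build the large Swin variant and load an NYUv2 checkpoint (path assumed).
    model = torch.nn.DataParallel(
        NewCRFDepth(version='large07', inv_depth=False, max_depth=10, pretrained=None)).cuda()
    model.load_state_dict(torch.load('../checkpoints/nyu_L.pth', map_location='cpu')['model'])
    model.eval()

    # Same preprocessing as inference(): scale to [0, 1], then ImageNet-normalize.
    img = np.asarray(Image.open('example.jpg').convert('RGB'), dtype=np.float32) / 255.0
    img = torch.from_numpy(img.transpose((2, 0, 1)))
    img = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])(img)

    with torch.no_grad():
        pred_depths_r_list, _, _ = model(img.unsqueeze(0).cuda())
    depth = pred_depths_r_list[-1].squeeze().cpu().numpy()  # metric depth map, H x W
    print(depth.shape, float(depth.min()), float(depth.max()))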
    	
        iebins/networks/NewCRFDepth.py
    ADDED
    
    | @@ -0,0 +1,318 @@ | |
| 1 | 
            +
            import torch
         | 
| 2 | 
            +
            import torch.nn as nn
         | 
| 3 | 
            +
            import torch.nn.functional as F
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            from .swin_transformer import SwinTransformer
         | 
| 6 | 
            +
            from .newcrf_layers import NewCRF
         | 
| 7 | 
            +
            from .uper_crf_head import PSP
         | 
| 8 | 
            +
from .depth_update import *
         | 
| 9 | 
            +
            ########################################################################################################################
         | 
| 10 | 
            +
             | 
| 11 | 
            +
             | 
| 12 | 
            +
            class NewCRFDepth(nn.Module):
         | 
| 13 | 
            +
                """
         | 
| 14 | 
            +
                Depth network based on neural window FC-CRFs architecture.
         | 
| 15 | 
            +
                """
         | 
| 16 | 
            +
                def __init__(self, version=None, inv_depth=False, pretrained=None, 
         | 
| 17 | 
            +
                                frozen_stages=-1, min_depth=0.1, max_depth=100.0, **kwargs):
         | 
| 18 | 
            +
                    super().__init__()
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                    self.inv_depth = inv_depth
         | 
| 21 | 
            +
                    self.with_auxiliary_head = False
         | 
| 22 | 
            +
                    self.with_neck = False
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                    norm_cfg = dict(type='BN', requires_grad=True)
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                    window_size = int(version[-2:])
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                    if version[:-2] == 'base':
         | 
| 29 | 
            +
                        embed_dim = 128
         | 
| 30 | 
            +
                        depths = [2, 2, 18, 2]
         | 
| 31 | 
            +
                        num_heads = [4, 8, 16, 32]
         | 
| 32 | 
            +
                        in_channels = [128, 256, 512, 1024]
         | 
| 33 | 
            +
                        self.update = BasicUpdateBlockDepth(hidden_dim=128, context_dim=128)
         | 
| 34 | 
            +
                    elif version[:-2] == 'large':
         | 
| 35 | 
            +
                        embed_dim = 192
         | 
| 36 | 
            +
                        depths = [2, 2, 18, 2]
         | 
| 37 | 
            +
                        num_heads = [6, 12, 24, 48]
         | 
| 38 | 
            +
                        in_channels = [192, 384, 768, 1536]
         | 
| 39 | 
            +
                        self.update = BasicUpdateBlockDepth(hidden_dim=128, context_dim=192)
         | 
| 40 | 
            +
                    elif version[:-2] == 'tiny':
         | 
| 41 | 
            +
                        embed_dim = 96
         | 
| 42 | 
            +
                        depths = [2, 2, 6, 2]
         | 
| 43 | 
            +
                        num_heads = [3, 6, 12, 24]
         | 
| 44 | 
            +
                        in_channels = [96, 192, 384, 768]
         | 
| 45 | 
            +
                        self.update = BasicUpdateBlockDepth(hidden_dim=128, context_dim=96)
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                    backbone_cfg = dict(
         | 
| 48 | 
            +
                        embed_dim=embed_dim,
         | 
| 49 | 
            +
                        depths=depths,
         | 
| 50 | 
            +
                        num_heads=num_heads,
         | 
| 51 | 
            +
                        window_size=window_size,
         | 
| 52 | 
            +
                        ape=False,
         | 
| 53 | 
            +
                        drop_path_rate=0.3,
         | 
| 54 | 
            +
                        patch_norm=True,
         | 
| 55 | 
            +
                        use_checkpoint=False,
         | 
| 56 | 
            +
                        frozen_stages=frozen_stages
         | 
| 57 | 
            +
                    )
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                    embed_dim = 512
         | 
| 60 | 
            +
                    decoder_cfg = dict(
         | 
| 61 | 
            +
                        in_channels=in_channels,
         | 
| 62 | 
            +
                        in_index=[0, 1, 2, 3],
         | 
| 63 | 
            +
                        pool_scales=(1, 2, 3, 6),
         | 
| 64 | 
            +
                        channels=embed_dim,
         | 
| 65 | 
            +
                        dropout_ratio=0.0,
         | 
| 66 | 
            +
                        num_classes=32,
         | 
| 67 | 
            +
                        norm_cfg=norm_cfg,
         | 
| 68 | 
            +
                        align_corners=False
         | 
| 69 | 
            +
                    )
         | 
| 70 | 
            +
             | 
| 71 | 
            +
                    self.backbone = SwinTransformer(**backbone_cfg)
         | 
| 72 | 
            +
                    v_dim = decoder_cfg['num_classes']*4
         | 
| 73 | 
            +
                    win = 7
         | 
| 74 | 
            +
                    crf_dims = [128, 256, 512, 1024]
         | 
| 75 | 
            +
                    v_dims = [64, 128, 256, embed_dim]
         | 
| 76 | 
            +
                    self.crf3 = NewCRF(input_dim=in_channels[3], embed_dim=crf_dims[3], window_size=win, v_dim=v_dims[3], num_heads=32)
         | 
| 77 | 
            +
                    self.crf2 = NewCRF(input_dim=in_channels[2], embed_dim=crf_dims[2], window_size=win, v_dim=v_dims[2], num_heads=16)
         | 
| 78 | 
            +
                    self.crf1 = NewCRF(input_dim=in_channels[1], embed_dim=crf_dims[1], window_size=win, v_dim=v_dims[1], num_heads=8)
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                    self.decoder = PSP(**decoder_cfg)
         | 
| 81 | 
            +
                    self.disp_head1 = DispHead(input_dim=crf_dims[0])
         | 
| 82 | 
            +
             | 
| 83 | 
            +
                    self.up_mode = 'bilinear'
         | 
| 84 | 
            +
                    if self.up_mode == 'mask':
         | 
| 85 | 
            +
                        self.mask_head = nn.Sequential(
         | 
| 86 | 
            +
                            nn.Conv2d(v_dims[0], 64, 3, padding=1),
         | 
| 87 | 
            +
                            nn.ReLU(inplace=True),
         | 
| 88 | 
            +
                            nn.Conv2d(64, 16*9, 1, padding=0))
         | 
| 89 | 
            +
             | 
| 90 | 
            +
                    self.min_depth = min_depth
         | 
| 91 | 
            +
                    self.max_depth = max_depth
         | 
| 92 | 
            +
                    self.depth_num = 16
         | 
| 93 | 
            +
                    self.hidden_dim = 128
         | 
| 94 | 
            +
                    self.project = Projection(v_dims[0], self.hidden_dim)
         | 
| 95 | 
            +
             | 
| 96 | 
            +
                    self.init_weights(pretrained=pretrained)
         | 
| 97 | 
            +
             | 
| 98 | 
            +
                def init_weights(self, pretrained=None):
         | 
| 99 | 
            +
                    """Initialize the weights in backbone and heads.
         | 
| 100 | 
            +
             | 
| 101 | 
            +
                    Args:
         | 
| 102 | 
            +
                        pretrained (str, optional): Path to pre-trained weights.
         | 
| 103 | 
            +
                            Defaults to None.
         | 
| 104 | 
            +
                    """
         | 
| 105 | 
            +
                    print(f'== Load encoder backbone from: {pretrained}')
         | 
| 106 | 
            +
                    self.backbone.init_weights(pretrained=pretrained)
         | 
| 107 | 
            +
                    self.decoder.init_weights()
         | 
| 108 | 
            +
                    if self.with_auxiliary_head:
         | 
| 109 | 
            +
                        if isinstance(self.auxiliary_head, nn.ModuleList):
         | 
| 110 | 
            +
                            for aux_head in self.auxiliary_head:
         | 
| 111 | 
            +
                                aux_head.init_weights()
         | 
| 112 | 
            +
                        else:
         | 
| 113 | 
            +
                            self.auxiliary_head.init_weights()
         | 
| 114 | 
            +
             | 
| 115 | 
            +
                def upsample_mask(self, disp, mask):
         | 
| 116 | 
            +
                    """ Upsample disp [H/4, W/4, 1] -> [H, W, 1] using convex combination """
         | 
| 117 | 
            +
                    N, C, H, W = disp.shape
         | 
| 118 | 
            +
                    mask = mask.view(N, 1, 9, 4, 4, H, W)
         | 
| 119 | 
            +
                    mask = torch.softmax(mask, dim=2)
         | 
| 120 | 
            +
             | 
| 121 | 
            +
                    up_disp = F.unfold(disp, kernel_size=3, padding=1)
         | 
| 122 | 
            +
                    up_disp = up_disp.view(N, C, 9, 1, 1, H, W)
         | 
| 123 | 
            +
             | 
| 124 | 
            +
                    up_disp = torch.sum(mask * up_disp, dim=2)
         | 
| 125 | 
            +
                    up_disp = up_disp.permute(0, 1, 4, 2, 5, 3)
         | 
| 126 | 
            +
                    return up_disp.reshape(N, C, 4*H, 4*W)
         | 
| 127 | 
            +
             | 
| 128 | 
            +
                def forward(self, imgs, epoch=1, step=100):
         | 
| 129 | 
            +
             | 
| 130 | 
            +
                    feats = self.backbone(imgs)
         | 
| 131 | 
            +
                    ppm_out = self.decoder(feats)
         | 
| 132 | 
            +
             | 
| 133 | 
            +
                    e3 = self.crf3(feats[3], ppm_out)
         | 
| 134 | 
            +
                    e3 = nn.PixelShuffle(2)(e3)
         | 
| 135 | 
            +
                    e2 = self.crf2(feats[2], e3)
         | 
| 136 | 
            +
                    e2 = nn.PixelShuffle(2)(e2)
         | 
| 137 | 
            +
                    e1 = self.crf1(feats[1], e2)
         | 
| 138 | 
            +
                    e1 = nn.PixelShuffle(2)(e1)
         | 
| 139 | 
            +
                    
         | 
| 140 | 
            +
                    # iterative bins
         | 
| 141 | 
            +
                    if epoch == 0 and step < 80:
         | 
| 142 | 
            +
                        max_tree_depth = 3
         | 
| 143 | 
            +
                    else:
         | 
| 144 | 
            +
                        max_tree_depth = 6
         | 
| 145 | 
            +
                    
         | 
| 146 | 
            +
                    if self.up_mode == 'mask':
         | 
| 147 | 
            +
                        mask = self.mask_head(e1)
         | 
| 148 | 
            +
             | 
| 149 | 
            +
                    b, c, h, w = e1.shape
         | 
| 150 | 
            +
                    device = e1.device
         | 
| 151 | 
            +
                           
         | 
| 152 | 
            +
                    depth = torch.zeros([b, 1, h, w]).to(device)
         | 
| 153 | 
            +
                    context = feats[0]
         | 
| 154 | 
            +
                    gru_hidden = torch.tanh(self.project(e1))
         | 
| 155 | 
            +
                    pred_depths_r_list, pred_depths_c_list, uncertainty_maps_list = self.update(depth, context, gru_hidden, max_tree_depth, self.depth_num, self.min_depth, self.max_depth)
         | 
| 156 | 
            +
                    
         | 
| 157 | 
            +
                    if self.up_mode == 'mask':
         | 
| 158 | 
            +
                        for i in range(len(pred_depths_r_list)):
         | 
| 159 | 
            +
                            pred_depths_r_list[i] = self.upsample_mask(pred_depths_r_list[i], mask)  
         | 
| 160 | 
            +
                        for i in range(len(pred_depths_c_list)):
         | 
| 161 | 
            +
                            pred_depths_c_list[i] = self.upsample_mask(pred_depths_c_list[i], mask.detach())
         | 
| 162 | 
            +
                        for i in range(len(uncertainty_maps_list)):
         | 
| 163 | 
            +
                            uncertainty_maps_list[i] = self.upsample_mask(uncertainty_maps_list[i], mask.detach())                   
         | 
| 164 | 
            +
                    else:
         | 
| 165 | 
            +
                        for i in range(len(pred_depths_r_list)):
         | 
| 166 | 
            +
                            pred_depths_r_list[i] = upsample(pred_depths_r_list[i], scale_factor=4)
         | 
| 167 | 
            +
                        for i in range(len(pred_depths_c_list)):
         | 
| 168 | 
            +
                            pred_depths_c_list[i] = upsample(pred_depths_c_list[i], scale_factor=4) 
         | 
| 169 | 
            +
                        for i in range(len(uncertainty_maps_list)):
         | 
| 170 | 
            +
                            uncertainty_maps_list[i] = upsample(uncertainty_maps_list[i], scale_factor=4) 
         | 
| 171 | 
            +
             | 
| 172 | 
            +
                    return pred_depths_r_list, pred_depths_c_list, uncertainty_maps_list
         | 
| 173 | 
            +
             | 
| 174 | 
            +
            class DispHead(nn.Module):
         | 
| 175 | 
            +
                def __init__(self, input_dim=100):
         | 
| 176 | 
            +
                    super(DispHead, self).__init__()
         | 
| 177 | 
            +
                    # self.norm1 = nn.BatchNorm2d(input_dim)
         | 
| 178 | 
            +
                    self.conv1 = nn.Conv2d(input_dim, 1, 3, padding=1)
         | 
| 179 | 
            +
                    # self.relu = nn.ReLU(inplace=True)
         | 
| 180 | 
            +
                    self.sigmoid = nn.Sigmoid()
         | 
| 181 | 
            +
             | 
| 182 | 
            +
                def forward(self, x, scale):
         | 
| 183 | 
            +
                    # x = self.relu(self.norm1(x))
         | 
| 184 | 
            +
                    x = self.sigmoid(self.conv1(x))
         | 
| 185 | 
            +
                    if scale > 1:
         | 
| 186 | 
            +
                        x = upsample(x, scale_factor=scale)
         | 
| 187 | 
            +
                    return x
         | 
| 188 | 
            +
             | 
| 189 | 
            +
            class BasicUpdateBlockDepth(nn.Module):
         | 
| 190 | 
            +
                def __init__(self, hidden_dim=128, context_dim=192):
         | 
| 191 | 
            +
                    super(BasicUpdateBlockDepth, self).__init__()
         | 
| 192 | 
            +
                            
         | 
| 193 | 
            +
                    self.encoder = ProjectionInputDepth(hidden_dim=hidden_dim, out_chs=hidden_dim * 2)
         | 
| 194 | 
            +
                    self.gru = SepConvGRU(hidden_dim=hidden_dim, input_dim=self.encoder.out_chs+context_dim)
         | 
| 195 | 
            +
                    self.p_head = PHead(hidden_dim, hidden_dim)
         | 
| 196 | 
            +
             | 
| 197 | 
            +
                def forward(self, depth, context, gru_hidden, seq_len, depth_num, min_depth, max_depth):
         | 
| 198 | 
            +
             
         | 
| 199 | 
            +
                    pred_depths_r_list = []
         | 
| 200 | 
            +
                    pred_depths_c_list = []
         | 
| 201 | 
            +
                    uncertainty_maps_list = []
         | 
| 202 | 
            +
                  
         | 
| 203 | 
            +
                    b, _, h, w = depth.shape
         | 
| 204 | 
            +
                    depth_range = max_depth - min_depth
         | 
| 205 | 
            +
                    interval = depth_range / depth_num
         | 
| 206 | 
            +
                    interval = interval * torch.ones_like(depth)
         | 
| 207 | 
            +
                    interval = interval.repeat(1, depth_num, 1, 1)
         | 
| 208 | 
            +
                    interval = torch.cat([torch.ones_like(depth) * min_depth, interval], 1)
         | 
| 209 | 
            +
             | 
| 210 | 
            +
                    bin_edges = torch.cumsum(interval, 1)
         | 
| 211 | 
            +
                    current_depths = 0.5 * (bin_edges[:, :-1] + bin_edges[:, 1:])
         | 
| 212 | 
            +
                    index_iter = 0
         | 
| 213 | 
            +
             | 
| 214 | 
            +
                    for i in range(seq_len):
         | 
| 215 | 
            +
                        input_features = self.encoder(current_depths.detach())
         | 
| 216 | 
            +
                        input_c = torch.cat([input_features, context], dim=1)
         | 
| 217 | 
            +
             | 
| 218 | 
            +
                        gru_hidden = self.gru(gru_hidden, input_c)
         | 
| 219 | 
            +
                        pred_prob = self.p_head(gru_hidden)
         | 
| 220 | 
            +
             | 
| 221 | 
            +
                        depth_r = (pred_prob * current_depths.detach()).sum(1, keepdim=True)
         | 
| 222 | 
            +
                        pred_depths_r_list.append(depth_r)
         | 
| 223 | 
            +
             | 
| 224 | 
            +
                        uncertainty_map = torch.sqrt((pred_prob * ((current_depths.detach() - depth_r.repeat(1, depth_num, 1, 1))**2)).sum(1, keepdim=True))
         | 
| 225 | 
            +
                        uncertainty_maps_list.append(uncertainty_map)
         | 
| 226 | 
            +
                    
         | 
| 227 | 
            +
                        index_iter = index_iter + 1
         | 
| 228 | 
            +
             | 
| 229 | 
            +
                        pred_label = get_label(torch.squeeze(depth_r, 1), bin_edges, depth_num).unsqueeze(1)
         | 
| 230 | 
            +
                        depth_c = torch.gather(current_depths.detach(), 1, pred_label.detach())
         | 
| 231 | 
            +
                        pred_depths_c_list.append(depth_c)
         | 
| 232 | 
            +
             | 
| 233 | 
            +
                        label_target_bin_left = pred_label
         | 
| 234 | 
            +
                        target_bin_left = torch.gather(bin_edges, 1, label_target_bin_left)
         | 
| 235 | 
            +
                        label_target_bin_right = (pred_label.float() + 1).long()
         | 
| 236 | 
            +
                        target_bin_right = torch.gather(bin_edges, 1, label_target_bin_right)
         | 
| 237 | 
            +
             | 
| 238 | 
            +
                        bin_edges, current_depths = update_sample(bin_edges, target_bin_left, target_bin_right, depth_r.detach(), pred_label.detach(), depth_num, min_depth, max_depth, uncertainty_map)
         | 
| 239 | 
            +
             | 
| 240 | 
            +
                    return pred_depths_r_list, pred_depths_c_list, uncertainty_maps_list
         | 
| 241 | 
            +
             | 
| 242 | 
            +
            class PHead(nn.Module):
         | 
| 243 | 
            +
                def __init__(self, input_dim=128, hidden_dim=128):
         | 
| 244 | 
            +
                    super(PHead, self).__init__()
         | 
| 245 | 
            +
                    self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1)
         | 
| 246 | 
            +
                    self.conv2 = nn.Conv2d(hidden_dim, 16, 3, padding=1)
         | 
| 247 | 
            +
                
         | 
| 248 | 
            +
                def forward(self, x):
         | 
| 249 | 
            +
                    out = torch.softmax(self.conv2(F.relu(self.conv1(x))), 1)
         | 
| 250 | 
            +
                    return out
         | 
| 251 | 
            +
             | 
| 252 | 
            +
            class SepConvGRU(nn.Module):
         | 
| 253 | 
            +
                def __init__(self, hidden_dim=128, input_dim=128+192):
         | 
| 254 | 
            +
                    super(SepConvGRU, self).__init__()
         | 
| 255 | 
            +
             | 
| 256 | 
            +
                    self.convz1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2))
         | 
| 257 | 
            +
                    self.convr1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2))
         | 
| 258 | 
            +
                    self.convq1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2))
         | 
| 259 | 
            +
                    self.convz2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0))
         | 
| 260 | 
            +
                    self.convr2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0))
         | 
| 261 | 
            +
                    self.convq2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0))
         | 
| 262 | 
            +
             | 
| 263 | 
            +
                def forward(self, h, x):
         | 
| 264 | 
            +
                    # horizontal
         | 
| 265 | 
            +
                    hx = torch.cat([h, x], dim=1)
         | 
| 266 | 
            +
                    z = torch.sigmoid(self.convz1(hx))
         | 
| 267 | 
            +
                    r = torch.sigmoid(self.convr1(hx))
         | 
| 268 | 
            +
                    q = torch.tanh(self.convq1(torch.cat([r*h, x], dim=1))) 
         | 
| 269 | 
            +
                    
         | 
| 270 | 
            +
                    h = (1-z) * h + z * q
         | 
| 271 | 
            +
             | 
| 272 | 
            +
                    # vertical
         | 
| 273 | 
            +
                    hx = torch.cat([h, x], dim=1)
         | 
| 274 | 
            +
                    z = torch.sigmoid(self.convz2(hx))
         | 
| 275 | 
            +
                    r = torch.sigmoid(self.convr2(hx))
         | 
| 276 | 
            +
                    q = torch.tanh(self.convq2(torch.cat([r*h, x], dim=1)))       
         | 
| 277 | 
            +
                    h = (1-z) * h + z * q
         | 
| 278 | 
            +
             | 
| 279 | 
            +
                    return h
         | 
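Shape sketch (illustrative only) for the separable GRU above: gating runs first along rows with 1x5 convolutions, then along columns with 5x1 convolutions, keeping the hidden state at the same 1/4 resolution. The sizes below assume the large07 configuration, a 480x640 input, and that the repository root is on PYTHONPATH:

    import torch
    from iebins.networks.NewCRFDepth import SepConvGRU

    gru = SepConvGRU(hidden_dim=128, input_dim=256 + 192)  # depth encoding (256) + context (192)
    h = torch.randn(1, 128, 120, 160)                      # GRU hidden state
    x = torch.randn(1, 256 + 192, 120, 160)                # concatenated input features
    print(gru(h, x).shape)                                 # torch.Size([1, 128, 120, 160])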
| 280 | 
            +
             | 
| 281 | 
            +
            class ProjectionInputDepth(nn.Module):
         | 
| 282 | 
            +
                def __init__(self, hidden_dim, out_chs):
         | 
| 283 | 
            +
                    super().__init__()
         | 
| 284 | 
            +
                    self.out_chs = out_chs 
         | 
| 285 | 
            +
                    self.convd1 = nn.Conv2d(16, hidden_dim, 7, padding=3)
         | 
| 286 | 
            +
                    self.convd2 = nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1)
         | 
| 287 | 
            +
                    self.convd3 = nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1)
         | 
| 288 | 
            +
                    self.convd4 = nn.Conv2d(hidden_dim, out_chs, 3, padding=1)
         | 
| 289 | 
            +
                    
         | 
| 290 | 
            +
                def forward(self, depth):
         | 
| 291 | 
            +
                    d = F.relu(self.convd1(depth))
         | 
| 292 | 
            +
                    d = F.relu(self.convd2(d))
         | 
| 293 | 
            +
                    d = F.relu(self.convd3(d))
         | 
| 294 | 
            +
                    d = F.relu(self.convd4(d))
         | 
| 295 | 
            +
                            
         | 
| 296 | 
            +
                    return d
         | 
| 297 | 
            +
             | 
| 298 | 
            +
            class Projection(nn.Module):
         | 
| 299 | 
            +
                def __init__(self, in_chs, out_chs):
         | 
| 300 | 
            +
                    super().__init__()
         | 
| 301 | 
            +
                    self.conv = nn.Conv2d(in_chs, out_chs, 3, padding=1)
         | 
| 302 | 
            +
                    
         | 
| 303 | 
            +
                def forward(self, x):
         | 
| 304 | 
            +
                    out = self.conv(x)
         | 
| 305 | 
            +
                            
         | 
| 306 | 
            +
                    return out
         | 
| 307 | 
            +
             | 
| 308 | 
            +
            def upsample(x, scale_factor=2, mode="bilinear", align_corners=False):
         | 
| 309 | 
            +
                """Upsample input tensor by a factor of 2
         | 
| 310 | 
            +
                """
         | 
| 311 | 
            +
                return F.interpolate(x, scale_factor=scale_factor, mode=mode, align_corners=align_corners)
         | 
| 312 | 
            +
             | 
| 313 | 
            +
            def upsample1(x, scale_factor=2, mode="bilinear"):
         | 
| 314 | 
            +
                """Upsample input tensor by a factor of 2
         | 
| 315 | 
            +
                """
         | 
| 316 | 
            +
                return F.interpolate(x, scale_factor=scale_factor, mode=mode)
         | 
| 317 | 
            +
             | 
| 318 | 
            +
             | 
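As a shape sketch (illustrative, not repository code): with the default epoch/step arguments the forward pass above runs six refinement iterations and returns three lists, each entry a B x 1 x H x W map at full input resolution. The NYUv2-sized input and the import path (repository root on PYTHONPATH) are assumptions:

    import torch
    from iebins.networks.NewCRFDepth import NewCRFDepth

    model = NewCRFDepth(version='large07', inv_depth=False, max_depth=10, pretrained=None).eval()
    x = torch.randn(1, 3, 480, 640)                        # NYUv2-sized RGB batch
    with torch.no_grad():
        depths_r, depths_c, uncertainties = model(x)       # default epoch=1 -> max_tree_depth=6
    print(len(depths_r), depths_r[-1].shape)               # 6 torch.Size([1, 1, 480, 640])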
    	
        iebins/networks/__init__.py
    ADDED
    
    | 
            File without changes
         | 
    	
        iebins/networks/depth_update.py
    ADDED
    
    | @@ -0,0 +1,39 @@ | |
| 1 | 
            +
            import torch
         | 
| 2 | 
            +
            import torch.nn.functional as F
         | 
| 3 | 
            +
            import copy
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            def update_sample(bin_edges, target_bin_left, target_bin_right, depth_r, pred_label, depth_num, min_depth, max_depth, uncertainty_range):
         | 
| 6 | 
            +
                
         | 
| 7 | 
            +
                with torch.no_grad():    
         | 
| 8 | 
            +
                    b, _, h, w = bin_edges.shape
         | 
| 9 | 
            +
             | 
| 10 | 
            +
                    mode = 'direct'
         | 
| 11 | 
            +
                    if mode == 'direct':
         | 
| 12 | 
            +
                        depth_range = uncertainty_range
         | 
| 13 | 
            +
                        depth_start_update = torch.clamp_min(depth_r - 0.5 * depth_range, min_depth)
         | 
| 14 | 
            +
                    else:
         | 
| 15 | 
            +
                        depth_range = uncertainty_range + (target_bin_right - target_bin_left).abs()
         | 
| 16 | 
            +
                        depth_start_update = torch.clamp_min(target_bin_left - 0.5 * uncertainty_range, min_depth)
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                    interval = depth_range / depth_num
         | 
| 19 | 
            +
                    interval = interval.repeat(1, depth_num, 1, 1)
         | 
| 20 | 
            +
                    interval = torch.cat([torch.ones([b, 1, h, w], device=bin_edges.device) * depth_start_update, interval], 1)
         | 
| 21 | 
            +
             | 
| 22 | 
            +
                    bin_edges = torch.cumsum(interval, 1).clamp(min_depth, max_depth)
         | 
| 23 | 
            +
                    curr_depth = 0.5 * (bin_edges[:, :-1] + bin_edges[:, 1:])
         | 
| 24 | 
            +
                    
         | 
| 25 | 
            +
                return bin_edges.detach(), curr_depth.detach()
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            def get_label(gt_depth_img, bin_edges, depth_num):
         | 
| 28 | 
            +
             | 
| 29 | 
            +
                with torch.no_grad():
         | 
| 30 | 
            +
                    gt_label = torch.zeros(gt_depth_img.size(), dtype=torch.int64, device=gt_depth_img.device)
         | 
| 31 | 
            +
                    for i in range(depth_num):
         | 
| 32 | 
            +
                        bin_mask = torch.ge(gt_depth_img, bin_edges[:, i])
         | 
| 33 | 
            +
                        bin_mask = torch.logical_and(bin_mask, 
         | 
| 34 | 
            +
                            torch.lt(gt_depth_img, bin_edges[:, i + 1]))
         | 
| 35 | 
            +
                        gt_label[bin_mask] = i
         | 
| 36 | 
            +
                    
         | 
| 37 | 
            +
                    return gt_label
         | 
| 38 | 
            +
             | 
| 39 | 
            +
             | 
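A tiny numeric sketch (illustration only) of the uniform initial bins that BasicUpdateBlockDepth builds and the index get_label assigns to a ground-truth depth; the import path assumes the repository root is on PYTHONPATH:

    import torch
    from iebins.networks.depth_update import get_label

    min_depth, max_depth, depth_num = 0.1, 10.0, 16
    b, h, w = 1, 1, 1
    interval = (max_depth - min_depth) / depth_num            # 0.61875 m per bin
    edges = torch.cat([torch.full((b, 1, h, w), min_depth),
                       torch.full((b, depth_num, h, w), interval)], dim=1)
    bin_edges = torch.cumsum(edges, dim=1)                    # 17 edges from 0.1 m to 10.0 m
    gt = torch.full((b, h, w), 3.0)                           # a 3 m ground-truth depth
    print(int(get_label(gt, bin_edges, depth_num)))           # -> 4 (the bin containing 3 m)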
    	
        iebins/networks/newcrf_layers.py
    ADDED
    
    | @@ -0,0 +1,433 @@ | |
| 1 | 
            +
            import torch
         | 
| 2 | 
            +
            import torch.nn as nn
         | 
| 3 | 
            +
            import torch.nn.functional as F
         | 
| 4 | 
            +
            import torch.utils.checkpoint as checkpoint
         | 
| 5 | 
            +
            import numpy as np
         | 
| 6 | 
            +
            from timm.models.layers import DropPath, to_2tuple, trunc_normal_
         | 
| 7 | 
            +
             | 
| 8 | 
            +
             | 
| 9 | 
            +
            class Mlp(nn.Module):
         | 
| 10 | 
            +
                """ Multilayer perceptron."""
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
         | 
| 13 | 
            +
                    super().__init__()
         | 
| 14 | 
            +
                    out_features = out_features or in_features
         | 
| 15 | 
            +
                    hidden_features = hidden_features or in_features
         | 
| 16 | 
            +
                    self.fc1 = nn.Linear(in_features, hidden_features)
         | 
| 17 | 
            +
                    self.act = act_layer()
         | 
| 18 | 
            +
                    self.fc2 = nn.Linear(hidden_features, out_features)
         | 
| 19 | 
            +
                    self.drop = nn.Dropout(drop)
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                def forward(self, x):
         | 
| 22 | 
            +
                    x = self.fc1(x)
         | 
| 23 | 
            +
                    x = self.act(x)
         | 
| 24 | 
            +
                    x = self.drop(x)
         | 
| 25 | 
            +
                    x = self.fc2(x)
         | 
| 26 | 
            +
                    x = self.drop(x)
         | 
| 27 | 
            +
                    return x
         | 
| 28 | 
            +
             | 
| 29 | 
            +
             | 
| 30 | 
            +
            def window_partition(x, window_size):
         | 
| 31 | 
            +
                """
         | 
| 32 | 
            +
                Args:
         | 
| 33 | 
            +
                    x: (B, H, W, C)
         | 
| 34 | 
            +
                    window_size (int): window size
         | 
| 35 | 
            +
             | 
| 36 | 
            +
                Returns:
         | 
| 37 | 
            +
                    windows: (num_windows*B, window_size, window_size, C)
         | 
| 38 | 
            +
                """
         | 
| 39 | 
            +
                B, H, W, C = x.shape
         | 
| 40 | 
            +
                x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
         | 
| 41 | 
            +
                windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
         | 
| 42 | 
            +
                return windows
         | 
| 43 | 
            +
             | 
| 44 | 
            +
             | 
| 45 | 
            +
            def window_reverse(windows, window_size, H, W):
         | 
| 46 | 
            +
                """
         | 
| 47 | 
            +
                Args:
         | 
| 48 | 
            +
                    windows: (num_windows*B, window_size, window_size, C)
         | 
| 49 | 
            +
                    window_size (int): Window size
         | 
| 50 | 
            +
                    H (int): Height of image
         | 
| 51 | 
            +
                    W (int): Width of image
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                Returns:
         | 
| 54 | 
            +
                    x: (B, H, W, C)
         | 
| 55 | 
            +
                """
         | 
| 56 | 
            +
                B = int(windows.shape[0] / (H * W / window_size / window_size))
         | 
| 57 | 
            +
                x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
         | 
| 58 | 
            +
                x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
         | 
| 59 | 
            +
                return x
         | 
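A quick round-trip check (illustrative only) of the two helpers above: when H and W are multiples of the window size, window_reverse exactly undoes window_partition. The import path assumes the repository root is on PYTHONPATH:

    import torch
    from iebins.networks.newcrf_layers import window_partition, window_reverse

    x = torch.randn(2, 14, 14, 96)             # B, H, W, C with window_size = 7
    windows = window_partition(x, 7)           # 2 images, 2 x 2 windows each
    y = window_reverse(windows, 7, 14, 14)
    print(windows.shape, torch.equal(x, y))    # torch.Size([8, 7, 7, 96]) True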
| 60 | 
            +
             | 
| 61 | 
            +
             | 
| 62 | 
            +
            class WindowAttention(nn.Module):
         | 
| 63 | 
            +
                """ Window based multi-head self attention (W-MSA) module with relative position bias.
         | 
| 64 | 
            +
                It supports both of shifted and non-shifted window.
         | 
| 65 | 
            +
             | 
| 66 | 
            +
                Args:
         | 
| 67 | 
            +
                    dim (int): Number of input channels.
         | 
| 68 | 
            +
                    window_size (tuple[int]): The height and width of the window.
         | 
| 69 | 
            +
        num_heads (int): Number of attention heads.
        v_dim (int): Number of channels of the externally supplied value features.
         | 
| 70 | 
            +
                    qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
         | 
| 71 | 
            +
                    qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
         | 
| 72 | 
            +
                    attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
         | 
| 73 | 
            +
                    proj_drop (float, optional): Dropout ratio of output. Default: 0.0
         | 
| 74 | 
            +
                """
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                def __init__(self, dim, window_size, num_heads, v_dim, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
         | 
| 77 | 
            +
             | 
| 78 | 
            +
                    super().__init__()
         | 
| 79 | 
            +
                    self.dim = dim
         | 
| 80 | 
            +
                    self.window_size = window_size  # Wh, Ww
         | 
| 81 | 
            +
                    self.num_heads = num_heads
         | 
| 82 | 
            +
                    head_dim = dim // num_heads
         | 
| 83 | 
            +
                    self.scale = qk_scale or head_dim ** -0.5
         | 
| 84 | 
            +
             | 
| 85 | 
            +
                    # define a parameter table of relative position bias
         | 
| 86 | 
            +
                    self.relative_position_bias_table = nn.Parameter(
         | 
| 87 | 
            +
                        torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH
         | 
| 88 | 
            +
             | 
| 89 | 
            +
                    # get pair-wise relative position index for each token inside the window
         | 
| 90 | 
            +
                    coords_h = torch.arange(self.window_size[0])
         | 
| 91 | 
            +
                    coords_w = torch.arange(self.window_size[1])
         | 
| 92 | 
            +
                    coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
         | 
| 93 | 
            +
                    coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
         | 
| 94 | 
            +
                    relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
         | 
| 95 | 
            +
                    relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
         | 
| 96 | 
            +
                    relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
         | 
| 97 | 
            +
                    relative_coords[:, :, 1] += self.window_size[1] - 1
         | 
| 98 | 
            +
                    relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
         | 
| 99 | 
            +
                    relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
         | 
| 100 | 
            +
                    self.register_buffer("relative_position_index", relative_position_index)
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                    self.qk = nn.Linear(dim, dim * 2, bias=qkv_bias)
         | 
| 103 | 
            +
                    self.attn_drop = nn.Dropout(attn_drop)
         | 
| 104 | 
            +
                    self.proj = nn.Linear(v_dim, v_dim)
         | 
| 105 | 
            +
                    self.proj_drop = nn.Dropout(proj_drop)
         | 
| 106 | 
            +
             | 
| 107 | 
            +
                    trunc_normal_(self.relative_position_bias_table, std=.02)
         | 
| 108 | 
            +
                    self.softmax = nn.Softmax(dim=-1)
         | 
| 109 | 
            +
             | 
| 110 | 
            +
                def forward(self, x, v, mask=None):
         | 
| 111 | 
            +
                    """ Forward function.
         | 
| 112 | 
            +
             | 
| 113 | 
            +
                    Args:
         | 
| 114 | 
            +
                        x: input features with shape of (num_windows*B, N, C)
         | 
| 115 | 
            +
                        mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
         | 
| 116 | 
            +
                    """
         | 
| 117 | 
            +
                    B_, N, C = x.shape
         | 
| 118 | 
            +
                    qk = self.qk(x).reshape(B_, N, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
         | 
| 119 | 
            +
                    q, k = qk[0], qk[1]  # make torchscript happy (cannot use tensor as tuple)
         | 
| 120 | 
            +
             | 
| 121 | 
            +
                    q = q * self.scale
         | 
| 122 | 
            +
                    attn = (q @ k.transpose(-2, -1))
         | 
| 123 | 
            +
             | 
| 124 | 
            +
                    relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
         | 
| 125 | 
            +
                        self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
         | 
| 126 | 
            +
                    relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
         | 
| 127 | 
            +
                    attn = attn + relative_position_bias.unsqueeze(0)
         | 
| 128 | 
            +
             | 
| 129 | 
            +
                    if mask is not None:
         | 
| 130 | 
            +
                        nW = mask.shape[0]
         | 
| 131 | 
            +
                        attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
         | 
| 132 | 
            +
                        attn = attn.view(-1, self.num_heads, N, N)
         | 
| 133 | 
            +
                        attn = self.softmax(attn)
         | 
| 134 | 
            +
                    else:
         | 
| 135 | 
            +
                        attn = self.softmax(attn)
         | 
| 136 | 
            +
             | 
| 137 | 
            +
                    attn = self.attn_drop(attn)
         | 
| 138 | 
            +
                    
         | 
| 139 | 
            +
                    # assert self.dim % v.shape[-1] == 0, "self.dim % v.shape[-1] != 0"
         | 
| 140 | 
            +
                    # repeat_num = self.dim // v.shape[-1]
         | 
| 141 | 
            +
                    # v = v.view(B_, N, self.num_heads // repeat_num, -1).transpose(1, 2).repeat(1, repeat_num, 1, 1)
         | 
| 142 | 
            +
             | 
| 143 | 
            +
                    assert self.dim == v.shape[-1], "self.dim != v.shape[-1]"
         | 
| 144 | 
            +
                    v = v.view(B_, N, self.num_heads, -1).transpose(1, 2)
         | 
| 145 | 
            +
             | 
| 146 | 
            +
                    x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
         | 
| 147 | 
            +
                    x = self.proj(x)
         | 
| 148 | 
            +
                    x = self.proj_drop(x)
         | 
| 149 | 
            +
                    return x
         | 
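Shape sketch (illustrative only): unlike plain W-MSA, queries and keys are computed from x while the value tensor v is supplied separately (here from the decoder path), and the assert above requires v_dim to equal dim:

    import torch
    from iebins.networks.newcrf_layers import WindowAttention

    attn = WindowAttention(dim=128, window_size=(7, 7), num_heads=4, v_dim=128)
    x = torch.randn(8, 49, 128)                # num_windows*B, Wh*Ww, C
    v = torch.randn(8, 49, 128)                # value features with the same window layout
    print(attn(x, v).shape)                    # torch.Size([8, 49, 128])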
| 150 | 
            +
             | 
| 151 | 
            +
             | 
| 152 | 
            +
            class CRFBlock(nn.Module):
         | 
| 153 | 
            +
                """ CRF Block.
         | 
| 154 | 
            +
             | 
| 155 | 
            +
                Args:
         | 
| 156 | 
            +
                    dim (int): Number of input channels.
         | 
| 157 | 
            +
        num_heads (int): Number of attention heads.
        v_dim (int): Number of channels of the value features passed to WindowAttention.
         | 
| 158 | 
            +
                    window_size (int): Window size.
         | 
| 159 | 
            +
                    shift_size (int): Shift size for SW-MSA.
         | 
| 160 | 
            +
                    mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
         | 
| 161 | 
            +
                    qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
         | 
| 162 | 
            +
                    qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
         | 
| 163 | 
            +
                    drop (float, optional): Dropout rate. Default: 0.0
         | 
| 164 | 
            +
                    attn_drop (float, optional): Attention dropout rate. Default: 0.0
         | 
| 165 | 
            +
                    drop_path (float, optional): Stochastic depth rate. Default: 0.0
         | 
| 166 | 
            +
                    act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
         | 
| 167 | 
            +
                    norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
         | 
| 168 | 
            +
                """
         | 
| 169 | 
            +
             | 
| 170 | 
            +
                def __init__(self, dim, num_heads, v_dim, window_size=7, shift_size=0,
         | 
| 171 | 
            +
                             mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
         | 
| 172 | 
            +
                             act_layer=nn.GELU, norm_layer=nn.LayerNorm):
         | 
| 173 | 
            +
                    super().__init__()
         | 
| 174 | 
            +
                    self.dim = dim
         | 
| 175 | 
            +
                    self.num_heads = num_heads
         | 
| 176 | 
            +
                    self.v_dim = v_dim
         | 
| 177 | 
            +
                    self.window_size = window_size
         | 
| 178 | 
            +
                    self.shift_size = shift_size
         | 
| 179 | 
            +
                    self.mlp_ratio = mlp_ratio
         | 
| 180 | 
            +
        assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)"
         | 
| 181 | 
            +
             | 
| 182 | 
            +
                    self.norm1 = norm_layer(dim)
         | 
| 183 | 
            +
                    self.attn = WindowAttention(
         | 
| 184 | 
            +
                        dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, v_dim=v_dim,
         | 
| 185 | 
            +
                        qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
         | 
| 186 | 
            +
             | 
| 187 | 
            +
                    self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
         | 
| 188 | 
            +
                    self.norm2 = norm_layer(v_dim)
         | 
| 189 | 
            +
                    mlp_hidden_dim = int(v_dim * mlp_ratio)
         | 
| 190 | 
            +
                    self.mlp = Mlp(in_features=v_dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
         | 
| 191 | 
            +
             | 
| 192 | 
            +
                    self.H = None
         | 
| 193 | 
            +
                    self.W = None
         | 
| 194 | 
            +
             | 
| 195 | 
            +
                def forward(self, x, v, mask_matrix):
         | 
| 196 | 
            +
                    """ Forward function.
         | 
| 197 | 
            +
             | 
| 198 | 
            +
                    Args:
         | 
| 199 | 
            +
            x: Input feature, tensor size (B, H*W, C).
            v: Value feature, tensor size (B, H, W, C).
         | 
| 200 | 
            +
                        H, W: Spatial resolution of the input feature.
         | 
| 201 | 
            +
                        mask_matrix: Attention mask for cyclic shift.
         | 
| 202 | 
            +
                    """
         | 
| 203 | 
            +
                    B, L, C = x.shape
         | 
| 204 | 
            +
                    H, W = self.H, self.W
         | 
| 205 | 
            +
                    assert L == H * W, "input feature has wrong size"
         | 
| 206 | 
            +
             | 
| 207 | 
            +
                    shortcut = x
         | 
| 208 | 
            +
                    x = self.norm1(x)
         | 
| 209 | 
            +
                    x = x.view(B, H, W, C)
         | 
| 210 | 
            +
             | 
| 211 | 
            +
                    # pad feature maps to multiples of window size
         | 
| 212 | 
            +
                    pad_l = pad_t = 0
         | 
| 213 | 
            +
                    pad_r = (self.window_size - W % self.window_size) % self.window_size
         | 
| 214 | 
            +
                    pad_b = (self.window_size - H % self.window_size) % self.window_size
         | 
| 215 | 
            +
                    x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
         | 
| 216 | 
            +
                    v = F.pad(v, (0, 0, pad_l, pad_r, pad_t, pad_b))
         | 
| 217 | 
            +
                    _, Hp, Wp, _ = x.shape
         | 
| 218 | 
            +
             | 
| 219 | 
            +
                    # cyclic shift
         | 
| 220 | 
            +
                    if self.shift_size > 0:
         | 
| 221 | 
            +
                        shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
         | 
| 222 | 
            +
                        shifted_v = torch.roll(v, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
         | 
| 223 | 
            +
                        attn_mask = mask_matrix
         | 
| 224 | 
            +
                    else:
         | 
| 225 | 
            +
                        shifted_x = x
         | 
| 226 | 
            +
                        shifted_v = v
         | 
| 227 | 
            +
                        attn_mask = None
         | 
| 228 | 
            +
             | 
| 229 | 
            +
                    # partition windows
         | 
| 230 | 
            +
                    x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
         | 
| 231 | 
            +
                    x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C
         | 
| 232 | 
            +
                    v_windows = window_partition(shifted_v, self.window_size)  # nW*B, window_size, window_size, C
         | 
| 233 | 
            +
                    v_windows = v_windows.view(-1, self.window_size * self.window_size, v_windows.shape[-1])  # nW*B, window_size*window_size, C
         | 
| 234 | 
            +
                    
         | 
| 235 | 
            +
                    # W-MSA/SW-MSA
         | 
| 236 | 
            +
                    attn_windows = self.attn(x_windows, v_windows, mask=attn_mask)  # nW*B, window_size*window_size, C
         | 
| 237 | 
            +
             | 
| 238 | 
            +
                    # merge windows
         | 
| 239 | 
            +
                    attn_windows = attn_windows.view(-1, self.window_size, self.window_size, self.v_dim)
         | 
| 240 | 
            +
                    shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C
         | 
| 241 | 
            +
             | 
| 242 | 
            +
                    # reverse cyclic shift
         | 
| 243 | 
            +
                    if self.shift_size > 0:
         | 
| 244 | 
            +
                        x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
         | 
| 245 | 
            +
                    else:
         | 
| 246 | 
            +
                        x = shifted_x
         | 
| 247 | 
            +
             | 
| 248 | 
            +
                    if pad_r > 0 or pad_b > 0:
         | 
| 249 | 
            +
                        x = x[:, :H, :W, :].contiguous()
         | 
| 250 | 
            +
             | 
| 251 | 
            +
                    x = x.view(B, H * W, self.v_dim)
         | 
| 252 | 
            +
             | 
| 253 | 
            +
                    # FFN
         | 
| 254 | 
            +
                    x = shortcut + self.drop_path(x)
         | 
| 255 | 
            +
                    x = x + self.drop_path(self.mlp(self.norm2(x)))
         | 
| 256 | 
            +
             | 
| 257 | 
            +
                    return x
         | 
| 258 | 
            +
             | 
| 259 | 
            +
             | 
| 260 | 
            +
            class BasicCRFLayer(nn.Module):
         | 
| 261 | 
            +
                """ A basic NeWCRFs layer for one stage.
         | 
| 262 | 
            +
             | 
| 263 | 
            +
                Args:
         | 
| 264 | 
            +
                    dim (int): Number of feature channels
         | 
| 265 | 
            +
                    depth (int): Depths of this stage.
         | 
| 266 | 
            +
                    num_heads (int): Number of attention head.
         | 
| 267 | 
            +
                    window_size (int): Local window size. Default: 7.
         | 
| 268 | 
            +
                    mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
         | 
| 269 | 
            +
                    qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
         | 
| 270 | 
            +
                    qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
         | 
| 271 | 
            +
                    drop (float, optional): Dropout rate. Default: 0.0
         | 
| 272 | 
            +
                    attn_drop (float, optional): Attention dropout rate. Default: 0.0
         | 
| 273 | 
            +
                    drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
         | 
| 274 | 
            +
                    norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
         | 
| 275 | 
            +
                    downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
         | 
| 276 | 
            +
                    use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
         | 
| 277 | 
            +
                """
         | 
| 278 | 
            +
             | 
| 279 | 
            +
                def __init__(self,
         | 
| 280 | 
            +
                             dim,
         | 
| 281 | 
            +
                             depth,
         | 
| 282 | 
            +
                             num_heads,
         | 
| 283 | 
            +
                             v_dim,
         | 
| 284 | 
            +
                             window_size=7,
         | 
| 285 | 
            +
                             mlp_ratio=4.,
         | 
| 286 | 
            +
                             qkv_bias=True,
         | 
| 287 | 
            +
                             qk_scale=None,
         | 
| 288 | 
            +
                             drop=0.,
         | 
| 289 | 
            +
                             attn_drop=0.,
         | 
| 290 | 
            +
                             drop_path=0.,
         | 
| 291 | 
            +
                             norm_layer=nn.LayerNorm,
         | 
| 292 | 
            +
                             downsample=None,
         | 
| 293 | 
            +
                             use_checkpoint=False):
         | 
| 294 | 
            +
                    super().__init__()
         | 
| 295 | 
            +
                    self.window_size = window_size
         | 
| 296 | 
            +
                    self.shift_size = window_size // 2
         | 
| 297 | 
            +
                    self.depth = depth
         | 
| 298 | 
            +
                    self.use_checkpoint = use_checkpoint
         | 
| 299 | 
            +
             | 
| 300 | 
            +
                    # build blocks
         | 
| 301 | 
            +
                    self.blocks = nn.ModuleList([
         | 
| 302 | 
            +
                        CRFBlock(
         | 
| 303 | 
            +
                            dim=dim,
         | 
| 304 | 
            +
                            num_heads=num_heads,
         | 
| 305 | 
            +
                            v_dim=v_dim,
         | 
| 306 | 
            +
                            window_size=window_size,
         | 
| 307 | 
            +
                            shift_size=0 if (i % 2 == 0) else window_size // 2,
         | 
| 308 | 
            +
                            mlp_ratio=mlp_ratio,
         | 
| 309 | 
            +
                            qkv_bias=qkv_bias,
         | 
| 310 | 
            +
                            qk_scale=qk_scale,
         | 
| 311 | 
            +
                            drop=drop,
         | 
| 312 | 
            +
                            attn_drop=attn_drop,
         | 
| 313 | 
            +
                            drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
         | 
| 314 | 
            +
                            norm_layer=norm_layer)
         | 
| 315 | 
            +
                        for i in range(depth)])
         | 
| 316 | 
            +
             | 
| 317 | 
            +
                    # patch merging layer
         | 
| 318 | 
            +
                    if downsample is not None:
         | 
| 319 | 
            +
                        self.downsample = downsample(dim=dim, norm_layer=norm_layer)
         | 
| 320 | 
            +
                    else:
         | 
| 321 | 
            +
                        self.downsample = None
         | 
| 322 | 
            +
             | 
| 323 | 
            +
                def forward(self, x, v, H, W):
         | 
| 324 | 
            +
                    """ Forward function.
         | 
| 325 | 
            +
             | 
| 326 | 
            +
                    Args:
         | 
| 327 | 
            +
            x: Input feature, tensor size (B, H*W, C).
            v: Value feature, tensor size (B, H, W, C).
         | 
| 328 | 
            +
                        H, W: Spatial resolution of the input feature.
         | 
| 329 | 
            +
                    """
         | 
| 330 | 
            +
             | 
| 331 | 
            +
                    # calculate attention mask for SW-MSA
         | 
| 332 | 
            +
                    Hp = int(np.ceil(H / self.window_size)) * self.window_size
         | 
| 333 | 
            +
                    Wp = int(np.ceil(W / self.window_size)) * self.window_size
         | 
| 334 | 
            +
                    img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
         | 
| 335 | 
            +
                    h_slices = (slice(0, -self.window_size),
         | 
| 336 | 
            +
                                slice(-self.window_size, -self.shift_size),
         | 
| 337 | 
            +
                                slice(-self.shift_size, None))
         | 
| 338 | 
            +
                    w_slices = (slice(0, -self.window_size),
         | 
| 339 | 
            +
                                slice(-self.window_size, -self.shift_size),
         | 
| 340 | 
            +
                                slice(-self.shift_size, None))
         | 
| 341 | 
            +
                    cnt = 0
         | 
| 342 | 
            +
                    for h in h_slices:
         | 
| 343 | 
            +
                        for w in w_slices:
         | 
| 344 | 
            +
                            img_mask[:, h, w, :] = cnt
         | 
| 345 | 
            +
                            cnt += 1
         | 
| 346 | 
            +
             | 
| 347 | 
            +
                    mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
         | 
| 348 | 
            +
                    mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
         | 
| 349 | 
            +
                    attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
         | 
| 350 | 
            +
                    attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
         | 
| 351 | 
            +
             | 
| 352 | 
            +
                    for blk in self.blocks:
         | 
| 353 | 
            +
                        blk.H, blk.W = H, W
         | 
| 354 | 
            +
                        if self.use_checkpoint:
         | 
| 355 | 
            +
                x = checkpoint.checkpoint(blk, x, v, attn_mask)  # blk.forward expects (x, v, mask_matrix)
         | 
| 356 | 
            +
                        else:
         | 
| 357 | 
            +
                            x = blk(x, v, attn_mask)
         | 
| 358 | 
            +
                    if self.downsample is not None:
         | 
| 359 | 
            +
                        x_down = self.downsample(x, H, W)
         | 
| 360 | 
            +
                        Wh, Ww = (H + 1) // 2, (W + 1) // 2
         | 
| 361 | 
            +
                        return x, H, W, x_down, Wh, Ww
         | 
| 362 | 
            +
                    else:
         | 
| 363 | 
            +
                        return x, H, W, x, H, W
         | 
| 364 | 
            +
             | 
| 365 | 
            +
             | 
| 366 | 
            +
            class NewCRF(nn.Module):
         | 
| 367 | 
            +
                def __init__(self,
         | 
| 368 | 
            +
                             input_dim=96,
         | 
| 369 | 
            +
                             embed_dim=96,
         | 
| 370 | 
            +
                             v_dim=64,
         | 
| 371 | 
            +
                             window_size=7,
         | 
| 372 | 
            +
                             num_heads=4,
         | 
| 373 | 
            +
                             depth=2,
         | 
| 374 | 
            +
                             patch_size=4,
         | 
| 375 | 
            +
                             in_chans=3,
         | 
| 376 | 
            +
                             norm_layer=nn.LayerNorm,
         | 
| 377 | 
            +
                             patch_norm=True):
         | 
| 378 | 
            +
                    super().__init__()
         | 
| 379 | 
            +
             | 
| 380 | 
            +
                    self.embed_dim = embed_dim
         | 
| 381 | 
            +
                    self.patch_norm = patch_norm
         | 
| 382 | 
            +
                    
         | 
| 383 | 
            +
                    if input_dim != embed_dim:
         | 
| 384 | 
            +
                        self.proj_x = nn.Conv2d(input_dim, embed_dim, 3, padding=1)
         | 
| 385 | 
            +
                    else:
         | 
| 386 | 
            +
                        self.proj_x = None
         | 
| 387 | 
            +
             | 
| 388 | 
            +
                    if v_dim != embed_dim:
         | 
| 389 | 
            +
                        self.proj_v = nn.Conv2d(v_dim, embed_dim, 3, padding=1)
         | 
| 390 | 
            +
                    elif embed_dim % v_dim == 0:
         | 
| 391 | 
            +
                        self.proj_v = None
         | 
| 392 | 
            +
             | 
| 393 | 
            +
        # For now, v_dim needs to equal embed_dim, because the output of the window attention is the input of the shifted-window attention.
         | 
| 394 | 
            +
                    v_dim = embed_dim
         | 
| 395 | 
            +
                    assert v_dim == embed_dim
         | 
| 396 | 
            +
             | 
| 397 | 
            +
                    self.crf_layer = BasicCRFLayer(
         | 
| 398 | 
            +
                            dim=embed_dim,
         | 
| 399 | 
            +
                            depth=depth,
         | 
| 400 | 
            +
                            num_heads=num_heads,
         | 
| 401 | 
            +
                            v_dim=v_dim,
         | 
| 402 | 
            +
                            window_size=window_size,
         | 
| 403 | 
            +
                            mlp_ratio=4.,
         | 
| 404 | 
            +
                            qkv_bias=True,
         | 
| 405 | 
            +
                            qk_scale=None,
         | 
| 406 | 
            +
                            drop=0.,
         | 
| 407 | 
            +
                            attn_drop=0.,
         | 
| 408 | 
            +
                            drop_path=0.,
         | 
| 409 | 
            +
                            norm_layer=norm_layer,
         | 
| 410 | 
            +
                            downsample=None,
         | 
| 411 | 
            +
                            use_checkpoint=False)
         | 
| 412 | 
            +
             | 
| 413 | 
            +
                    layer = norm_layer(embed_dim)
         | 
| 414 | 
            +
                    layer_name = 'norm_crf'
         | 
| 415 | 
            +
                    self.add_module(layer_name, layer)
         | 
| 416 | 
            +
             | 
| 417 | 
            +
             | 
| 418 | 
            +
                def forward(self, x, v):
         | 
| 419 | 
            +
                    if self.proj_x is not None:
         | 
| 420 | 
            +
                        x = self.proj_x(x)
         | 
| 421 | 
            +
                    if self.proj_v is not None:
         | 
| 422 | 
            +
                        v = self.proj_v(v)
         | 
| 423 | 
            +
             | 
| 424 | 
            +
                    Wh, Ww = x.size(2), x.size(3)
         | 
| 425 | 
            +
                    x = x.flatten(2).transpose(1, 2)
         | 
| 426 | 
            +
                    v = v.transpose(1, 2).transpose(2, 3)
         | 
| 427 | 
            +
             | 
| 428 | 
            +
                    x_out, H, W, x, Wh, Ww = self.crf_layer(x, v, Wh, Ww)
         | 
| 429 | 
            +
        norm_layer = getattr(self, 'norm_crf')
         | 
| 430 | 
            +
                    x_out = norm_layer(x_out)
         | 
| 431 | 
            +
                    out = x_out.view(-1, H, W, self.embed_dim).permute(0, 3, 1, 2).contiguous()
         | 
| 432 | 
            +
             | 
| 433 | 
            +
                    return out
         | 
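A minimal smoke test for the NewCRF module above (this also exercises CRFBlock and BasicCRFLayer); it is not part of the commit, and the feature-map shapes are illustrative assumptions rather than values taken from this Space's configs:

import torch
from iebins.networks.newcrf_layers import NewCRF

# x is the encoder feature map, v the value/decoder feature map; both share the same H x W.
crf = NewCRF(input_dim=192, embed_dim=128, v_dim=64, window_size=7, num_heads=4)
x = torch.randn(1, 192, 60, 80)   # (B, input_dim, H, W)
v = torch.randn(1, 64, 60, 80)    # (B, v_dim, H, W)
out = crf(x, v)                   # both are projected to embed_dim, then refined window-wise
print(out.shape)                  # torch.Size([1, 128, 60, 80])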
    	
        iebins/networks/newcrf_utils.py
    ADDED
    
    | @@ -0,0 +1,264 @@ | |
| 1 | 
            +
            import warnings
         | 
| 2 | 
            +
            import os
         | 
| 3 | 
            +
            import os.path as osp
         | 
| 4 | 
            +
            import pkgutil
         | 
| 5 | 
            +
            import warnings
         | 
| 6 | 
            +
            from collections import OrderedDict
         | 
| 7 | 
            +
            from importlib import import_module
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            import torch
         | 
| 10 | 
            +
            import torchvision
         | 
| 11 | 
            +
            import torch.nn as nn
         | 
| 12 | 
            +
            from torch.utils import model_zoo
         | 
| 13 | 
            +
            from torch.nn import functional as F
         | 
| 14 | 
            +
            from torch.nn.parallel import DataParallel, DistributedDataParallel
         | 
| 15 | 
            +
            from torch import distributed as dist
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            TORCH_VERSION = torch.__version__
         | 
| 18 | 
            +
             | 
| 19 | 
            +
             | 
| 20 | 
            +
            def resize(input,
         | 
| 21 | 
            +
                       size=None,
         | 
| 22 | 
            +
                       scale_factor=None,
         | 
| 23 | 
            +
                       mode='nearest',
         | 
| 24 | 
            +
                       align_corners=None,
         | 
| 25 | 
            +
                       warning=True):
         | 
| 26 | 
            +
                if warning:
         | 
| 27 | 
            +
                    if size is not None and align_corners:
         | 
| 28 | 
            +
                        input_h, input_w = tuple(int(x) for x in input.shape[2:])
         | 
| 29 | 
            +
                        output_h, output_w = tuple(int(x) for x in size)
         | 
| 30 | 
            +
            if output_h > input_h or output_w > input_w:
         | 
| 31 | 
            +
                            if ((output_h > 1 and output_w > 1 and input_h > 1
         | 
| 32 | 
            +
                                 and input_w > 1) and (output_h - 1) % (input_h - 1)
         | 
| 33 | 
            +
                                    and (output_w - 1) % (input_w - 1)):
         | 
| 34 | 
            +
                                warnings.warn(
         | 
| 35 | 
            +
                                    f'When align_corners={align_corners}, '
         | 
| 36 | 
            +
                        'the output would be more aligned if '
         | 
| 37 | 
            +
                                    f'input size {(input_h, input_w)} is `x+1` and '
         | 
| 38 | 
            +
                                    f'out size {(output_h, output_w)} is `nx+1`')
         | 
| 39 | 
            +
                if isinstance(size, torch.Size):
         | 
| 40 | 
            +
                    size = tuple(int(x) for x in size)
         | 
| 41 | 
            +
                return F.interpolate(input, size, scale_factor, mode, align_corners)
         | 
| 42 | 
            +
             | 
| 43 | 
            +
             | 
| 44 | 
            +
            def normal_init(module, mean=0, std=1, bias=0):
         | 
| 45 | 
            +
                if hasattr(module, 'weight') and module.weight is not None:
         | 
| 46 | 
            +
                    nn.init.normal_(module.weight, mean, std)
         | 
| 47 | 
            +
                if hasattr(module, 'bias') and module.bias is not None:
         | 
| 48 | 
            +
                    nn.init.constant_(module.bias, bias)
         | 
| 49 | 
            +
             | 
| 50 | 
            +
             | 
| 51 | 
            +
            def is_module_wrapper(module):
         | 
| 52 | 
            +
                module_wrappers = (DataParallel, DistributedDataParallel)
         | 
| 53 | 
            +
                return isinstance(module, module_wrappers)
         | 
| 54 | 
            +
             | 
| 55 | 
            +
             | 
| 56 | 
            +
            def get_dist_info():
         | 
| 57 | 
            +
                if TORCH_VERSION < '1.0':
         | 
| 58 | 
            +
                    initialized = dist._initialized
         | 
| 59 | 
            +
                else:
         | 
| 60 | 
            +
                    if dist.is_available():
         | 
| 61 | 
            +
                        initialized = dist.is_initialized()
         | 
| 62 | 
            +
                    else:
         | 
| 63 | 
            +
                        initialized = False
         | 
| 64 | 
            +
                if initialized:
         | 
| 65 | 
            +
                    rank = dist.get_rank()
         | 
| 66 | 
            +
                    world_size = dist.get_world_size()
         | 
| 67 | 
            +
                else:
         | 
| 68 | 
            +
                    rank = 0
         | 
| 69 | 
            +
                    world_size = 1
         | 
| 70 | 
            +
                return rank, world_size
         | 
| 71 | 
            +
             | 
| 72 | 
            +
             | 
| 73 | 
            +
            def load_state_dict(module, state_dict, strict=False, logger=None):
         | 
| 74 | 
            +
                """Load state_dict to a module.
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                This method is modified from :meth:`torch.nn.Module.load_state_dict`.
         | 
| 77 | 
            +
                Default value for ``strict`` is set to ``False`` and the message for
         | 
| 78 | 
            +
                param mismatch will be shown even if strict is False.
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                Args:
         | 
| 81 | 
            +
                    module (Module): Module that receives the state_dict.
         | 
| 82 | 
            +
                    state_dict (OrderedDict): Weights.
         | 
| 83 | 
            +
                    strict (bool): whether to strictly enforce that the keys
         | 
| 84 | 
            +
                        in :attr:`state_dict` match the keys returned by this module's
         | 
| 85 | 
            +
                        :meth:`~torch.nn.Module.state_dict` function. Default: ``False``.
         | 
| 86 | 
            +
                    logger (:obj:`logging.Logger`, optional): Logger to log the error
         | 
| 87 | 
            +
                        message. If not specified, print function will be used.
         | 
| 88 | 
            +
                """
         | 
| 89 | 
            +
                unexpected_keys = []
         | 
| 90 | 
            +
                all_missing_keys = []
         | 
| 91 | 
            +
                err_msg = []
         | 
| 92 | 
            +
             | 
| 93 | 
            +
                metadata = getattr(state_dict, '_metadata', None)
         | 
| 94 | 
            +
                state_dict = state_dict.copy()
         | 
| 95 | 
            +
                if metadata is not None:
         | 
| 96 | 
            +
                    state_dict._metadata = metadata
         | 
| 97 | 
            +
             | 
| 98 | 
            +
                # use _load_from_state_dict to enable checkpoint version control
         | 
| 99 | 
            +
                def load(module, prefix=''):
         | 
| 100 | 
            +
                    # recursively check parallel module in case that the model has a
         | 
| 101 | 
            +
                    # complicated structure, e.g., nn.Module(nn.Module(DDP))
         | 
| 102 | 
            +
                    if is_module_wrapper(module):
         | 
| 103 | 
            +
                        module = module.module
         | 
| 104 | 
            +
                    local_metadata = {} if metadata is None else metadata.get(
         | 
| 105 | 
            +
                        prefix[:-1], {})
         | 
| 106 | 
            +
                    module._load_from_state_dict(state_dict, prefix, local_metadata, True,
         | 
| 107 | 
            +
                                                 all_missing_keys, unexpected_keys,
         | 
| 108 | 
            +
                                                 err_msg)
         | 
| 109 | 
            +
                    for name, child in module._modules.items():
         | 
| 110 | 
            +
                        if child is not None:
         | 
| 111 | 
            +
                            load(child, prefix + name + '.')
         | 
| 112 | 
            +
             | 
| 113 | 
            +
                load(module)
         | 
| 114 | 
            +
                load = None  # break load->load reference cycle
         | 
| 115 | 
            +
             | 
| 116 | 
            +
                # ignore "num_batches_tracked" of BN layers
         | 
| 117 | 
            +
                missing_keys = [
         | 
| 118 | 
            +
                    key for key in all_missing_keys if 'num_batches_tracked' not in key
         | 
| 119 | 
            +
                ]
         | 
| 120 | 
            +
             | 
| 121 | 
            +
                if unexpected_keys:
         | 
| 122 | 
            +
                    err_msg.append('unexpected key in source '
         | 
| 123 | 
            +
                                   f'state_dict: {", ".join(unexpected_keys)}\n')
         | 
| 124 | 
            +
                if missing_keys:
         | 
| 125 | 
            +
                    err_msg.append(
         | 
| 126 | 
            +
                        f'missing keys in source state_dict: {", ".join(missing_keys)}\n')
         | 
| 127 | 
            +
             | 
| 128 | 
            +
                rank, _ = get_dist_info()
         | 
| 129 | 
            +
                if len(err_msg) > 0 and rank == 0:
         | 
| 130 | 
            +
                    err_msg.insert(
         | 
| 131 | 
            +
                        0, 'The model and loaded state dict do not match exactly\n')
         | 
| 132 | 
            +
                    err_msg = '\n'.join(err_msg)
         | 
| 133 | 
            +
                    if strict:
         | 
| 134 | 
            +
                        raise RuntimeError(err_msg)
         | 
| 135 | 
            +
                    elif logger is not None:
         | 
| 136 | 
            +
                        logger.warning(err_msg)
         | 
| 137 | 
            +
                    else:
         | 
| 138 | 
            +
                        print(err_msg)
         | 
| 139 | 
            +
             | 
| 140 | 
            +
             | 
| 141 | 
            +
            def load_url_dist(url, model_dir=None):
         | 
| 142 | 
            +
                """In distributed setting, this function only download checkpoint at local
         | 
| 143 | 
            +
                rank 0."""
         | 
| 144 | 
            +
                rank, world_size = get_dist_info()
         | 
| 145 | 
            +
                rank = int(os.environ.get('LOCAL_RANK', rank))
         | 
| 146 | 
            +
                if rank == 0:
         | 
| 147 | 
            +
                    checkpoint = model_zoo.load_url(url, model_dir=model_dir)
         | 
| 148 | 
            +
                if world_size > 1:
         | 
| 149 | 
            +
                    torch.distributed.barrier()
         | 
| 150 | 
            +
                    if rank > 0:
         | 
| 151 | 
            +
                        checkpoint = model_zoo.load_url(url, model_dir=model_dir)
         | 
| 152 | 
            +
                return checkpoint
         | 
| 153 | 
            +
             | 
| 154 | 
            +
             | 
| 155 | 
            +
            def get_torchvision_models():
         | 
| 156 | 
            +
                model_urls = dict()
         | 
| 157 | 
            +
                for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__):
         | 
| 158 | 
            +
                    if ispkg:
         | 
| 159 | 
            +
                        continue
         | 
| 160 | 
            +
                    _zoo = import_module(f'torchvision.models.{name}')
         | 
| 161 | 
            +
                    if hasattr(_zoo, 'model_urls'):
         | 
| 162 | 
            +
                        _urls = getattr(_zoo, 'model_urls')
         | 
| 163 | 
            +
                        model_urls.update(_urls)
         | 
| 164 | 
            +
                return model_urls
         | 
| 165 | 
            +
             | 
| 166 | 
            +
             | 
| 167 | 
            +
            def _load_checkpoint(filename, map_location=None):
         | 
| 168 | 
            +
                """Load checkpoint from somewhere (modelzoo, file, url).
         | 
| 169 | 
            +
             | 
| 170 | 
            +
                Args:
         | 
| 171 | 
            +
                    filename (str): Accept local filepath, URL, ``torchvision://xxx``,
         | 
| 172 | 
            +
                        ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
         | 
| 173 | 
            +
                        details.
         | 
| 174 | 
            +
                    map_location (str | None): Same as :func:`torch.load`. Default: None.
         | 
| 175 | 
            +
             | 
| 176 | 
            +
                Returns:
         | 
| 177 | 
            +
                    dict | OrderedDict: The loaded checkpoint. It can be either an
         | 
| 178 | 
            +
                        OrderedDict storing model weights or a dict containing other
         | 
| 179 | 
            +
                        information, which depends on the checkpoint.
         | 
| 180 | 
            +
                """
         | 
| 181 | 
            +
                if filename.startswith('modelzoo://'):
         | 
| 182 | 
            +
                    warnings.warn('The URL scheme of "modelzoo://" is deprecated, please '
         | 
| 183 | 
            +
                                  'use "torchvision://" instead')
         | 
| 184 | 
            +
                    model_urls = get_torchvision_models()
         | 
| 185 | 
            +
                    model_name = filename[11:]
         | 
| 186 | 
            +
                    checkpoint = load_url_dist(model_urls[model_name])
         | 
| 187 | 
            +
                else:
         | 
| 188 | 
            +
                    if not osp.isfile(filename):
         | 
| 189 | 
            +
                        raise IOError(f'{filename} is not a checkpoint file')
         | 
| 190 | 
            +
                    checkpoint = torch.load(filename, map_location=map_location)
         | 
| 191 | 
            +
                return checkpoint
         | 
| 192 | 
            +
             | 
| 193 | 
            +
             | 
| 194 | 
            +
            def load_checkpoint(model,
         | 
| 195 | 
            +
                                filename,
         | 
| 196 | 
            +
                                map_location='cpu',
         | 
| 197 | 
            +
                                strict=False,
         | 
| 198 | 
            +
                                logger=None):
         | 
| 199 | 
            +
                """Load checkpoint from a file or URI.
         | 
| 200 | 
            +
             | 
| 201 | 
            +
                Args:
         | 
| 202 | 
            +
                    model (Module): Module to load checkpoint.
         | 
| 203 | 
            +
                    filename (str): Accept local filepath, URL, ``torchvision://xxx``,
         | 
| 204 | 
            +
                        ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
         | 
| 205 | 
            +
                        details.
         | 
| 206 | 
            +
                    map_location (str): Same as :func:`torch.load`.
         | 
| 207 | 
            +
                    strict (bool): Whether to allow different params for the model and
         | 
| 208 | 
            +
                        checkpoint.
         | 
| 209 | 
            +
                    logger (:mod:`logging.Logger` or None): The logger for error message.
         | 
| 210 | 
            +
             | 
| 211 | 
            +
                Returns:
         | 
| 212 | 
            +
                    dict or OrderedDict: The loaded checkpoint.
         | 
| 213 | 
            +
                """
         | 
| 214 | 
            +
                checkpoint = _load_checkpoint(filename, map_location)
         | 
| 215 | 
            +
                # OrderedDict is a subclass of dict
         | 
| 216 | 
            +
                if not isinstance(checkpoint, dict):
         | 
| 217 | 
            +
                    raise RuntimeError(
         | 
| 218 | 
            +
                        f'No state_dict found in checkpoint file {filename}')
         | 
| 219 | 
            +
                # get state_dict from checkpoint
         | 
| 220 | 
            +
                if 'state_dict' in checkpoint:
         | 
| 221 | 
            +
                    state_dict = checkpoint['state_dict']
         | 
| 222 | 
            +
                elif 'model' in checkpoint:
         | 
| 223 | 
            +
                    state_dict = checkpoint['model']
         | 
| 224 | 
            +
                else:
         | 
| 225 | 
            +
                    state_dict = checkpoint
         | 
| 226 | 
            +
                # strip prefix of state_dict
         | 
| 227 | 
            +
                if list(state_dict.keys())[0].startswith('module.'):
         | 
| 228 | 
            +
                    state_dict = {k[7:]: v for k, v in state_dict.items()}
         | 
| 229 | 
            +
             | 
| 230 | 
            +
                # for MoBY, load model of online branch
         | 
| 231 | 
            +
                if sorted(list(state_dict.keys()))[0].startswith('encoder'):
         | 
| 232 | 
            +
                    state_dict = {k.replace('encoder.', ''): v for k, v in state_dict.items() if k.startswith('encoder.')}
         | 
| 233 | 
            +
             | 
| 234 | 
            +
                # reshape absolute position embedding
         | 
| 235 | 
            +
                if state_dict.get('absolute_pos_embed') is not None:
         | 
| 236 | 
            +
                    absolute_pos_embed = state_dict['absolute_pos_embed']
         | 
| 237 | 
            +
                    N1, L, C1 = absolute_pos_embed.size()
         | 
| 238 | 
            +
                    N2, C2, H, W = model.absolute_pos_embed.size()
         | 
| 239 | 
            +
                    if N1 != N2 or C1 != C2 or L != H*W:
         | 
| 240 | 
            +
                        logger.warning("Error in loading absolute_pos_embed, pass")
         | 
| 241 | 
            +
                    else:
         | 
| 242 | 
            +
                        state_dict['absolute_pos_embed'] = absolute_pos_embed.view(N2, H, W, C2).permute(0, 3, 1, 2)
         | 
| 243 | 
            +
             | 
| 244 | 
            +
                # interpolate position bias table if needed
         | 
| 245 | 
            +
                relative_position_bias_table_keys = [k for k in state_dict.keys() if "relative_position_bias_table" in k]
         | 
| 246 | 
            +
                for table_key in relative_position_bias_table_keys:
         | 
| 247 | 
            +
                    table_pretrained = state_dict[table_key]
         | 
| 248 | 
            +
                    table_current = model.state_dict()[table_key]
         | 
| 249 | 
            +
                    L1, nH1 = table_pretrained.size()
         | 
| 250 | 
            +
                    L2, nH2 = table_current.size()
         | 
| 251 | 
            +
                    if nH1 != nH2:
         | 
| 252 | 
            +
                        logger.warning(f"Error in loading {table_key}, pass")
         | 
| 253 | 
            +
                    else:
         | 
| 254 | 
            +
                        if L1 != L2:
         | 
| 255 | 
            +
                            S1 = int(L1 ** 0.5)
         | 
| 256 | 
            +
                            S2 = int(L2 ** 0.5)
         | 
| 257 | 
            +
                            table_pretrained_resized = F.interpolate(
         | 
| 258 | 
            +
                                 table_pretrained.permute(1, 0).view(1, nH1, S1, S1),
         | 
| 259 | 
            +
                                 size=(S2, S2), mode='bicubic')
         | 
| 260 | 
            +
                            state_dict[table_key] = table_pretrained_resized.view(nH2, L2).permute(1, 0)
         | 
| 261 | 
            +
             | 
| 262 | 
            +
                # load state_dict
         | 
| 263 | 
            +
                load_state_dict(model, state_dict, strict, logger)
         | 
| 264 | 
            +
                return checkpoint
         | 
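A hedged usage sketch for load_checkpoint above; the module and the checkpoint path are placeholders, not artifacts of this commit:

import torch.nn as nn
from iebins.networks.newcrf_utils import load_checkpoint

model = nn.Sequential(nn.Conv2d(3, 8, 3))        # stand-in module; normally a Swin backbone
# strict=False: key mismatches are only reported (via load_state_dict above), not raised
checkpoint = load_checkpoint(model, '/path/to/pretrained.pth',
                             map_location='cpu', strict=False, logger=None)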
    	
        iebins/networks/resize.py
    ADDED
    
    | @@ -0,0 +1,51 @@ | |
| 1 | 
            +
            # Copyright (c) OpenMMLab. All rights reserved.
         | 
| 2 | 
            +
            import warnings
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import torch.nn as nn
         | 
| 5 | 
            +
            import torch.nn.functional as F
         | 
| 6 | 
            +
             | 
| 7 | 
            +
             | 
| 8 | 
            +
            def resize(input,
         | 
| 9 | 
            +
                       size=None,
         | 
| 10 | 
            +
                       scale_factor=None,
         | 
| 11 | 
            +
                       mode='nearest',
         | 
| 12 | 
            +
                       align_corners=None,
         | 
| 13 | 
            +
                       warning=False):
         | 
| 14 | 
            +
                if warning:
         | 
| 15 | 
            +
                    if size is not None and align_corners:
         | 
| 16 | 
            +
                        input_h, input_w = tuple(int(x) for x in input.shape[2:])
         | 
| 17 | 
            +
                        output_h, output_w = tuple(int(x) for x in size)
         | 
| 18 | 
            +
            if output_h > input_h or output_w > input_w:
         | 
| 19 | 
            +
                            if ((output_h > 1 and output_w > 1 and input_h > 1
         | 
| 20 | 
            +
                                 and input_w > 1) and (output_h - 1) % (input_h - 1)
         | 
| 21 | 
            +
                                    and (output_w - 1) % (input_w - 1)):
         | 
| 22 | 
            +
                                warnings.warn(
         | 
| 23 | 
            +
                                    f'When align_corners={align_corners}, '
         | 
| 24 | 
            +
                        'the output would be more aligned if '
         | 
| 25 | 
            +
                                    f'input size {(input_h, input_w)} is `x+1` and '
         | 
| 26 | 
            +
                                    f'out size {(output_h, output_w)} is `nx+1`')
         | 
| 27 | 
            +
                return F.interpolate(input, size, scale_factor, mode, align_corners)
         | 
| 28 | 
            +
             | 
| 29 | 
            +
             | 
| 30 | 
            +
            class Upsample(nn.Module):
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                def __init__(self,
         | 
| 33 | 
            +
                             size=None,
         | 
| 34 | 
            +
                             scale_factor=None,
         | 
| 35 | 
            +
                             mode='nearest',
         | 
| 36 | 
            +
                             align_corners=None):
         | 
| 37 | 
            +
                    super(Upsample, self).__init__()
         | 
| 38 | 
            +
                    self.size = size
         | 
| 39 | 
            +
                    if isinstance(scale_factor, tuple):
         | 
| 40 | 
            +
                        self.scale_factor = tuple(float(factor) for factor in scale_factor)
         | 
| 41 | 
            +
                    else:
         | 
| 42 | 
            +
                        self.scale_factor = float(scale_factor) if scale_factor else None
         | 
| 43 | 
            +
                    self.mode = mode
         | 
| 44 | 
            +
                    self.align_corners = align_corners
         | 
| 45 | 
            +
             | 
| 46 | 
            +
                def forward(self, x):
         | 
| 47 | 
            +
                    if not self.size:
         | 
| 48 | 
            +
                        size = [int(t * self.scale_factor) for t in x.shape[-2:]]
         | 
| 49 | 
            +
                    else:
         | 
| 50 | 
            +
                        size = self.size
         | 
| 51 | 
            +
                    return resize(x, size, None, self.mode, self.align_corners)
         | 
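The Upsample wrapper above recomputes an explicit output size from the input shape instead of forwarding scale_factor to F.interpolate; a minimal illustration (shapes are assumptions):

import torch
from iebins.networks.resize import Upsample

up = Upsample(scale_factor=2, mode='bilinear', align_corners=True)
feat = torch.randn(1, 64, 30, 40)   # (B, C, H, W)
print(up(feat).shape)               # torch.Size([1, 64, 60, 80])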
    	
        iebins/networks/swin_transformer.py
    ADDED
    
    | @@ -0,0 +1,620 @@ | |
| 1 | 
            +
            import torch
         | 
| 2 | 
            +
            import torch.nn as nn
         | 
| 3 | 
            +
            import torch.nn.functional as F
         | 
| 4 | 
            +
            import torch.utils.checkpoint as checkpoint
         | 
| 5 | 
            +
            import numpy as np
         | 
| 6 | 
            +
            from timm.models.layers import DropPath, to_2tuple, trunc_normal_
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            from .newcrf_utils import load_checkpoint
         | 
| 9 | 
            +
             | 
| 10 | 
            +
             | 
| 11 | 
            +
            class Mlp(nn.Module):
         | 
| 12 | 
            +
                """ Multilayer perceptron."""
         | 
| 13 | 
            +
             | 
| 14 | 
            +
                def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
         | 
| 15 | 
            +
                    super().__init__()
         | 
| 16 | 
            +
                    out_features = out_features or in_features
         | 
| 17 | 
            +
                    hidden_features = hidden_features or in_features
         | 
| 18 | 
            +
                    self.fc1 = nn.Linear(in_features, hidden_features)
         | 
| 19 | 
            +
                    self.act = act_layer()
         | 
| 20 | 
            +
                    self.fc2 = nn.Linear(hidden_features, out_features)
         | 
| 21 | 
            +
                    self.drop = nn.Dropout(drop)
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                def forward(self, x):
         | 
| 24 | 
            +
                    x = self.fc1(x)
         | 
| 25 | 
            +
                    x = self.act(x)
         | 
| 26 | 
            +
                    x = self.drop(x)
         | 
| 27 | 
            +
                    x = self.fc2(x)
         | 
| 28 | 
            +
                    x = self.drop(x)
         | 
| 29 | 
            +
                    return x
         | 
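Mlp above is the usual two-layer token MLP: it widens the channel dimension internally and maps back to in_features, leaving the token layout unchanged. Illustrative shapes (not from the commit):

import torch
from iebins.networks.swin_transformer import Mlp

mlp = Mlp(in_features=96, hidden_features=384)
tokens = torch.randn(4, 49, 96)     # (num_windows*B, window_size*window_size, C)
print(mlp(tokens).shape)            # torch.Size([4, 49, 96])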
| 30 | 
            +
             | 
| 31 | 
            +
             | 
def window_partition(x, window_size):
    """
    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows


def window_reverse(windows, window_size, H, W):
    """
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x

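# ---------------------------------------------------------------------------
# Editor's illustrative sketch (not part of the original file):
# window_partition and window_reverse are exact inverses whenever H and W are
# multiples of window_size, which the callers below guarantee via padding.
def _demo_window_roundtrip():
    x = torch.randn(2, 14, 14, 96)            # (B, H, W, C), 14 = 2 * 7
    windows = window_partition(x, 7)           # (2*2*2, 7, 7, 96)
    assert windows.shape == (8, 7, 7, 96)
    x_back = window_reverse(windows, 7, 14, 14)
    assert torch.equal(x, x_back)              # lossless round trip
# ---------------------------------------------------------------------------
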
class WindowAttention(nn.Module):
    """ Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both shifted and non-shifted windows.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """ Forward function.

        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

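# ---------------------------------------------------------------------------
# Editor's illustrative sketch (not part of the original file): each window of
# Wh*Ww tokens attends only within itself; the learned relative-position bias
# table contributes one scalar per head for every (row-offset, col-offset)
# pair, looked up through relative_position_index.
def _demo_window_attention():
    attn = WindowAttention(dim=96, window_size=to_2tuple(7), num_heads=3)
    x = torch.randn(8, 49, 96)               # (num_windows*B, Wh*Ww, C)
    out = attn(x)                            # mask=None -> plain W-MSA
    assert out.shape == (8, 49, 96)
    assert attn.relative_position_bias_table.shape == (13 * 13, 3)  # (2*7-1)^2, nH
# ---------------------------------------------------------------------------
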
class SwinTransformerBlock(nn.Module):
    """ Swin Transformer Block.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(self, dim, num_heads, window_size=7, shift_size=0,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)

        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        self.H = None
        self.W = None

    def forward(self, x, mask_matrix):
        """ Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
            mask_matrix: Attention mask for cyclic shift.
        """
        B, L, C = x.shape
        H, W = self.H, self.W
        assert L == H * W, "input feature has wrong size"

        shortcut = x
        x = self.norm1(x)
        x = x.view(B, H, W, C)

        # pad feature maps to multiples of window size
        pad_l = pad_t = 0
        pad_r = (self.window_size - W % self.window_size) % self.window_size
        pad_b = (self.window_size - H % self.window_size) % self.window_size
        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
        _, Hp, Wp, _ = x.shape

        # cyclic shift
        if self.shift_size > 0:
            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
            attn_mask = mask_matrix
        else:
            shifted_x = x
            attn_mask = None

        # partition windows
        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C

        # W-MSA/SW-MSA
        attn_windows = self.attn(x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C

        # merge windows
        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C

        # reverse cyclic shift
        if self.shift_size > 0:
            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            x = shifted_x

        if pad_r > 0 or pad_b > 0:
            x = x[:, :H, :W, :].contiguous()

        x = x.view(B, H * W, C)

        # FFN
        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x

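# ---------------------------------------------------------------------------
# Editor's illustrative sketch (not part of the original file): the block
# consumes tokens in (B, H*W, C) layout and expects H/W to be set on the
# instance before the call; with shift_size=0 no attention mask is needed.
def _demo_swin_block():
    blk = SwinTransformerBlock(dim=96, num_heads=3, window_size=7, shift_size=0)
    blk.H, blk.W = 20, 20                 # 20 is not a multiple of 7: the block pads internally
    x = torch.randn(2, 20 * 20, 96)
    out = blk(x, mask_matrix=None)
    assert out.shape == (2, 400, 96)
# ---------------------------------------------------------------------------
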
class PatchMerging(nn.Module):
    """ Patch Merging Layer

    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    """
    def __init__(self, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x, H, W):
        """ Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        """
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"

        x = x.view(B, H, W, C)

        # padding
        pad_input = (H % 2 == 1) or (W % 2 == 1)
        if pad_input:
            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))

        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C

        x = self.norm(x)
        x = self.reduction(x)

        return x

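# ---------------------------------------------------------------------------
# Editor's illustrative sketch (not part of the original file): patch merging
# concatenates each 2x2 neighbourhood (4*C channels) and projects it to 2*C,
# halving the spatial resolution between stages.
def _demo_patch_merging():
    merge = PatchMerging(dim=96)
    x = torch.randn(2, 56 * 56, 96)
    out = merge(x, H=56, W=56)
    assert out.shape == (2, 28 * 28, 192)   # (H/2)*(W/2) tokens, 2*C channels
# ---------------------------------------------------------------------------
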
class BasicLayer(nn.Module):
    """ A basic Swin Transformer layer for one stage.

    Args:
        dim (int): Number of feature channels
        depth (int): Depths of this stage.
        num_heads (int): Number of attention head.
        window_size (int): Local window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self,
                 dim,
                 depth,
                 num_heads,
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 downsample=None,
                 use_checkpoint=False):
        super().__init__()
        self.window_size = window_size
        self.shift_size = window_size // 2
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # build blocks
        self.blocks = nn.ModuleList([
            SwinTransformerBlock(
                dim=dim,
                num_heads=num_heads,
                window_size=window_size,
                shift_size=0 if (i % 2 == 0) else window_size // 2,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop,
                attn_drop=attn_drop,
                drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                norm_layer=norm_layer)
            for i in range(depth)])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x, H, W):
        """ Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        """

        # calculate attention mask for SW-MSA
        Hp = int(np.ceil(H / self.window_size)) * self.window_size
        Wp = int(np.ceil(W / self.window_size)) * self.window_size
        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
        h_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        w_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))

        for blk in self.blocks:
            blk.H, blk.W = H, W
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x, attn_mask)
            else:
                x = blk(x, attn_mask)
        if self.downsample is not None:
            x_down = self.downsample(x, H, W)
            Wh, Ww = (H + 1) // 2, (W + 1) // 2
            return x, H, W, x_down, Wh, Ww
        else:
            return x, H, W, x, H, W

             | 
| 396 | 
            +
            class PatchEmbed(nn.Module):
         | 
| 397 | 
            +
                """ Image to Patch Embedding
         | 
| 398 | 
            +
             | 
| 399 | 
            +
                Args:
         | 
| 400 | 
            +
                    patch_size (int): Patch token size. Default: 4.
         | 
| 401 | 
            +
                    in_chans (int): Number of input image channels. Default: 3.
         | 
| 402 | 
            +
                    embed_dim (int): Number of linear projection output channels. Default: 96.
         | 
| 403 | 
            +
                    norm_layer (nn.Module, optional): Normalization layer. Default: None
         | 
| 404 | 
            +
                """
         | 
| 405 | 
            +
             | 
| 406 | 
            +
                def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
         | 
| 407 | 
            +
                    super().__init__()
         | 
| 408 | 
            +
                    patch_size = to_2tuple(patch_size)
         | 
| 409 | 
            +
                    self.patch_size = patch_size
         | 
| 410 | 
            +
             | 
| 411 | 
            +
                    self.in_chans = in_chans
         | 
| 412 | 
            +
                    self.embed_dim = embed_dim
         | 
| 413 | 
            +
             | 
| 414 | 
            +
                    self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
         | 
| 415 | 
            +
                    if norm_layer is not None:
         | 
| 416 | 
            +
                        self.norm = norm_layer(embed_dim)
         | 
| 417 | 
            +
                    else:
         | 
| 418 | 
            +
                        self.norm = None
         | 
| 419 | 
            +
             | 
| 420 | 
            +
                def forward(self, x):
         | 
| 421 | 
            +
                    """Forward function."""
         | 
| 422 | 
            +
                    # padding
         | 
| 423 | 
            +
                    _, _, H, W = x.size()
         | 
| 424 | 
            +
                    if W % self.patch_size[1] != 0:
         | 
| 425 | 
            +
                        x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
         | 
| 426 | 
            +
                    if H % self.patch_size[0] != 0:
         | 
| 427 | 
            +
                        x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
         | 
| 428 | 
            +
             | 
| 429 | 
            +
                    x = self.proj(x)  # B C Wh Ww
         | 
| 430 | 
            +
                    if self.norm is not None:
         | 
| 431 | 
            +
                        Wh, Ww = x.size(2), x.size(3)
         | 
| 432 | 
            +
                        x = x.flatten(2).transpose(1, 2)
         | 
| 433 | 
            +
                        x = self.norm(x)
         | 
| 434 | 
            +
                        x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
         | 
| 435 | 
            +
             | 
| 436 | 
            +
                    return x
         | 
| 437 | 
            +
             | 
| 438 | 
            +
             | 
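# ---------------------------------------------------------------------------
# Editor's illustrative sketch (not part of the original file): a strided 4x4
# convolution turns the image into a grid of patch tokens; inputs whose
# height/width are not multiples of 4 are right/bottom padded first.
def _demo_patch_embed():
    embed = PatchEmbed(patch_size=4, in_chans=3, embed_dim=96, norm_layer=nn.LayerNorm)
    img = torch.randn(2, 3, 225, 227)        # deliberately not divisible by 4
    feat = embed(img)
    assert feat.shape == (2, 96, 57, 57)     # ceil(225/4), ceil(227/4)
# ---------------------------------------------------------------------------
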
class SwinTransformer(nn.Module):
    """ Swin Transformer backbone.
        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
          https://arxiv.org/pdf/2103.14030

    Args:
        pretrain_img_size (int): Input image size for training the pretrained model,
            used in absolute position embedding. Default 224.
        patch_size (int | tuple(int)): Patch size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        depths (tuple[int]): Depths of each Swin Transformer stage.
        num_heads (tuple[int]): Number of attention head of each stage.
        window_size (int): Window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
        drop_rate (float): Dropout rate.
        attn_drop_rate (float): Attention dropout rate. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
        out_indices (Sequence[int]): Output from which stages.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters.
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self,
                 pretrain_img_size=224,
                 patch_size=4,
                 in_chans=3,
                 embed_dim=96,
                 depths=[2, 2, 6, 2],
                 num_heads=[3, 6, 12, 24],
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.2,
                 norm_layer=nn.LayerNorm,
                 ape=False,
                 patch_norm=True,
                 out_indices=(0, 1, 2, 3),
                 frozen_stages=-1,
                 use_checkpoint=False):
        super().__init__()

        self.pretrain_img_size = pretrain_img_size
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)

        # absolute position embedding
        if self.ape:
            pretrain_img_size = to_2tuple(pretrain_img_size)
            patch_size = to_2tuple(patch_size)
            patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]

            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))
            trunc_normal_(self.absolute_pos_embed, std=.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule

        # build layers
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dim * 2 ** i_layer),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                norm_layer=norm_layer,
                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
                use_checkpoint=use_checkpoint)
            self.layers.append(layer)

        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
        self.num_features = num_features

        # add a norm layer for each output
        for i_layer in out_indices:
            layer = norm_layer(num_features[i_layer])
            layer_name = f'norm{i_layer}'
            self.add_module(layer_name, layer)

        self._freeze_stages()

    def _freeze_stages(self):
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False

        if self.frozen_stages >= 1 and self.ape:
            self.absolute_pos_embed.requires_grad = False

        if self.frozen_stages >= 2:
            self.pos_drop.eval()
            for i in range(0, self.frozen_stages - 1):
                m = self.layers[i]
                m.eval()
                for param in m.parameters():
                    param.requires_grad = False

    def init_weights(self, pretrained=None):
        """Initialize the weights in backbone.

        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.
        """

        def _init_weights(m):
            if isinstance(m, nn.Linear):
                trunc_normal_(m.weight, std=.02)
                if isinstance(m, nn.Linear) and m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LayerNorm):
                nn.init.constant_(m.bias, 0)
                nn.init.constant_(m.weight, 1.0)

        if isinstance(pretrained, str):
            self.apply(_init_weights)
            # logger = get_root_logger()
            load_checkpoint(self, pretrained, strict=False)
        elif pretrained is None:
            self.apply(_init_weights)
        else:
            raise TypeError('pretrained must be a str or None')

    def forward(self, x):
        """Forward function."""
        x = self.patch_embed(x)

        Wh, Ww = x.size(2), x.size(3)
        if self.ape:
            # interpolate the position embedding to the corresponding size
            absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
            x = (x + absolute_pos_embed).flatten(2).transpose(1, 2)  # B Wh*Ww C
        else:
            x = x.flatten(2).transpose(1, 2)
        x = self.pos_drop(x)

        outs = []
        for i in range(self.num_layers):
            layer = self.layers[i]
            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)

            if i in self.out_indices:
                norm_layer = getattr(self, f'norm{i}')
                x_out = norm_layer(x_out)

                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
                outs.append(out)

        return tuple(outs)

    def train(self, mode=True):
        """Convert the model into training mode while keeping layers frozen."""
        super(SwinTransformer, self).train(mode)
        self._freeze_stages()
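

# ---------------------------------------------------------------------------
# Editor's illustrative sketch (not part of the original file): the default
# configuration is Swin-T; the backbone returns a 4-level feature pyramid at
# strides 4/8/16/32 with channel widths embed_dim * 2**i.
def _demo_swin_backbone():
    backbone = SwinTransformer(embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24])
    backbone.eval()
    with torch.no_grad():
        feats = backbone(torch.randn(1, 3, 224, 224))
    shapes = [tuple(f.shape) for f in feats]
    assert shapes == [(1, 96, 56, 56), (1, 192, 28, 28), (1, 384, 14, 14), (1, 768, 7, 7)]
# ---------------------------------------------------------------------------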
    	
        iebins/networks/uper_crf_head.py
    ADDED
    
@@ -0,0 +1,364 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from mmcv.cnn import ConvModule
from .newcrf_utils import resize, normal_init


class PPM(nn.ModuleList):
    """Pooling Pyramid Module used in PSPNet.

    Args:
        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module.
        in_channels (int): Input channels.
        channels (int): Channels after modules, before conv_seg.
        conv_cfg (dict|None): Config of conv layers.
        norm_cfg (dict|None): Config of norm layers.
        act_cfg (dict): Config of activation layers.
        align_corners (bool): align_corners argument of F.interpolate.
    """

    def __init__(self, pool_scales, in_channels, channels, conv_cfg, norm_cfg,
                 act_cfg, align_corners):
        super(PPM, self).__init__()
        self.pool_scales = pool_scales
        self.align_corners = align_corners
        self.in_channels = in_channels
        self.channels = channels
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        for pool_scale in pool_scales:
            # == if batch size = 1, BN is not supported, change to GN
            if pool_scale == 1: norm_cfg = dict(type='GN', requires_grad=True, num_groups=256)
            self.append(
                nn.Sequential(
                    nn.AdaptiveAvgPool2d(pool_scale),
                    ConvModule(
                        self.in_channels,
                        self.channels,
                        1,
                        conv_cfg=self.conv_cfg,
                        norm_cfg=norm_cfg,
                        act_cfg=self.act_cfg)))

    def forward(self, x):
        """Forward function."""
        ppm_outs = []
        for ppm in self:
            ppm_out = ppm(x)
            upsampled_ppm_out = resize(
                ppm_out,
                size=x.size()[2:],
                mode='bilinear',
                align_corners=self.align_corners)
            ppm_outs.append(upsampled_ppm_out)
        return ppm_outs

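# ---------------------------------------------------------------------------
# Editor's illustrative sketch (not part of the original file): the PPM pools
# the input to several coarse grids, projects each to `channels` with a 1x1
# ConvModule, and resizes everything back to the input resolution. The config
# values below are assumptions for illustration only (channels must be
# divisible by the 256 GN groups used for the pool_scale == 1 branch).
def _demo_ppm():
    ppm = PPM(pool_scales=(1, 2, 3, 6), in_channels=512, channels=512,
              conv_cfg=None, norm_cfg=None, act_cfg=dict(type='ReLU'),
              align_corners=False)
    x = torch.randn(2, 512, 15, 20)
    outs = ppm(x)                             # one tensor per pooling scale
    assert [tuple(o.shape) for o in outs] == [(2, 512, 15, 20)] * 4
# ---------------------------------------------------------------------------
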
| 62 | 
            +
                """Base class for BaseDecodeHead.
         | 
| 63 | 
            +
             | 
| 64 | 
            +
                Args:
         | 
| 65 | 
            +
                    in_channels (int|Sequence[int]): Input channels.
         | 
| 66 | 
            +
                    channels (int): Channels after modules, before conv_seg.
         | 
| 67 | 
            +
                    num_classes (int): Number of classes.
         | 
| 68 | 
            +
                    dropout_ratio (float): Ratio of dropout layer. Default: 0.1.
         | 
| 69 | 
            +
                    conv_cfg (dict|None): Config of conv layers. Default: None.
         | 
| 70 | 
            +
                    norm_cfg (dict|None): Config of norm layers. Default: None.
         | 
| 71 | 
            +
                    act_cfg (dict): Config of activation layers.
         | 
| 72 | 
            +
                        Default: dict(type='ReLU')
         | 
| 73 | 
            +
                    in_index (int|Sequence[int]): Input feature index. Default: -1
         | 
| 74 | 
            +
                    input_transform (str|None): Transformation type of input features.
         | 
| 75 | 
            +
                        Options: 'resize_concat', 'multiple_select', None.
         | 
| 76 | 
            +
                        'resize_concat': Multiple feature maps will be resize to the
         | 
| 77 | 
            +
                            same size as first one and than concat together.
         | 
| 78 | 
            +
                            Usually used in FCN head of HRNet.
         | 
| 79 | 
            +
                        'multiple_select': Multiple feature maps will be bundle into
         | 
| 80 | 
            +
                            a list and passed into decode head.
         | 
| 81 | 
            +
                        None: Only one select feature map is allowed.
         | 
| 82 | 
            +
                        Default: None.
         | 
| 83 | 
            +
                    loss_decode (dict): Config of decode loss.
         | 
| 84 | 
            +
                        Default: dict(type='CrossEntropyLoss').
         | 
| 85 | 
            +
                    ignore_index (int | None): The label index to be ignored. When using
         | 
| 86 | 
            +
                        masked BCE loss, ignore_index should be set to None. Default: 255
         | 
| 87 | 
            +
                    sampler (dict|None): The config of segmentation map sampler.
         | 
| 88 | 
            +
                        Default: None.
         | 
| 89 | 
            +
                    align_corners (bool): align_corners argument of F.interpolate.
         | 
| 90 | 
            +
                        Default: False.
         | 
| 91 | 
            +
                """
         | 
| 92 | 
            +
             | 
| 93 | 
            +
                def __init__(self,
         | 
| 94 | 
            +
                             in_channels,
         | 
| 95 | 
            +
                             channels,
         | 
| 96 | 
            +
                             *,
         | 
| 97 | 
            +
                             num_classes,
         | 
| 98 | 
            +
                             dropout_ratio=0.1,
         | 
| 99 | 
            +
                             conv_cfg=None,
         | 
| 100 | 
            +
                             norm_cfg=None,
         | 
| 101 | 
            +
                             act_cfg=dict(type='ReLU'),
         | 
| 102 | 
            +
                             in_index=-1,
         | 
| 103 | 
            +
                             input_transform=None,
         | 
| 104 | 
            +
                             loss_decode=dict(
         | 
| 105 | 
            +
                                 type='CrossEntropyLoss',
         | 
| 106 | 
            +
                                 use_sigmoid=False,
         | 
| 107 | 
            +
                                 loss_weight=1.0),
         | 
| 108 | 
            +
                             ignore_index=255,
         | 
| 109 | 
            +
                             sampler=None,
         | 
| 110 | 
            +
                             align_corners=False):
         | 
| 111 | 
            +
                    super(BaseDecodeHead, self).__init__()
         | 
| 112 | 
            +
                    self._init_inputs(in_channels, in_index, input_transform)
         | 
| 113 | 
            +
                    self.channels = channels
         | 
| 114 | 
            +
                    self.num_classes = num_classes
         | 
| 115 | 
            +
                    self.dropout_ratio = dropout_ratio
         | 
| 116 | 
            +
                    self.conv_cfg = conv_cfg
         | 
| 117 | 
            +
                    self.norm_cfg = norm_cfg
         | 
| 118 | 
            +
                    self.act_cfg = act_cfg
         | 
| 119 | 
            +
                    self.in_index = in_index
         | 
| 120 | 
            +
                    # self.loss_decode = build_loss(loss_decode)
         | 
| 121 | 
            +
                    self.ignore_index = ignore_index
         | 
| 122 | 
            +
                    self.align_corners = align_corners
         | 
| 123 | 
            +
                    # if sampler is not None:
         | 
| 124 | 
            +
                    #     self.sampler = build_pixel_sampler(sampler, context=self)
         | 
| 125 | 
            +
                    # else:
         | 
| 126 | 
            +
                    #     self.sampler = None
         | 
| 127 | 
            +
             | 
| 128 | 
            +
                    # self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
         | 
| 129 | 
            +
                    # self.conv1 = nn.Conv2d(channels, num_classes, 3, padding=1)
         | 
| 130 | 
            +
                    if dropout_ratio > 0:
         | 
| 131 | 
            +
                        self.dropout = nn.Dropout2d(dropout_ratio)
         | 
| 132 | 
            +
                    else:
         | 
| 133 | 
            +
                        self.dropout = None
         | 
| 134 | 
            +
                    self.fp16_enabled = False
         | 
| 135 | 
            +
             | 
| 136 | 
            +
                def extra_repr(self):
         | 
| 137 | 
            +
                    """Extra repr."""
         | 
| 138 | 
            +
                    s = f'input_transform={self.input_transform}, ' \
         | 
| 139 | 
            +
                        f'ignore_index={self.ignore_index}, ' \
         | 
| 140 | 
            +
                        f'align_corners={self.align_corners}'
         | 
| 141 | 
            +
                    return s
         | 
| 142 | 
            +
             | 
| 143 | 
            +
                def _init_inputs(self, in_channels, in_index, input_transform):
         | 
| 144 | 
            +
                    """Check and initialize input transforms.
         | 
| 145 | 
            +
             | 
| 146 | 
            +
                    The in_channels, in_index and input_transform must match.
         | 
| 147 | 
            +
                    Specifically, when input_transform is None, only single feature map
         | 
| 148 | 
            +
                    will be selected. So in_channels and in_index must be of type int.
         | 
| 149 | 
            +
                    When input_transform
         | 
| 150 | 
            +
             | 
| 151 | 
            +
                    Args:
         | 
| 152 | 
            +
                        in_channels (int|Sequence[int]): Input channels.
         | 
| 153 | 
            +
                        in_index (int|Sequence[int]): Input feature index.
         | 
| 154 | 
            +
                        input_transform (str|None): Transformation type of input features.
         | 
| 155 | 
            +
                            Options: 'resize_concat', 'multiple_select', None.
         | 
| 156 | 
            +
                            'resize_concat': Multiple feature maps will be resize to the
         | 
| 157 | 
            +
                                same size as first one and than concat together.
         | 
| 158 | 
            +
                                Usually used in FCN head of HRNet.
         | 
| 159 | 
            +
                            'multiple_select': Multiple feature maps will be bundle into
         | 
| 160 | 
            +
                                a list and passed into decode head.
         | 
| 161 | 
            +
                            None: Only one select feature map is allowed.
         | 
| 162 | 
            +
                    """
         | 
| 163 | 
            +
             | 
| 164 | 
            +
                    if input_transform is not None:
         | 
| 165 | 
            +
                        assert input_transform in ['resize_concat', 'multiple_select']
         | 
| 166 | 
            +
                    self.input_transform = input_transform
         | 
| 167 | 
            +
                    self.in_index = in_index
         | 
| 168 | 
            +
                    if input_transform is not None:
         | 
| 169 | 
            +
                        assert isinstance(in_channels, (list, tuple))
         | 
| 170 | 
            +
                        assert isinstance(in_index, (list, tuple))
         | 
| 171 | 
            +
                        assert len(in_channels) == len(in_index)
         | 
| 172 | 
            +
                        if input_transform == 'resize_concat':
         | 
| 173 | 
            +
                            self.in_channels = sum(in_channels)
         | 
| 174 | 
            +
                        else:
         | 
| 175 | 
            +
                            self.in_channels = in_channels
         | 
| 176 | 
            +
                    else:
         | 
| 177 | 
            +
                        assert isinstance(in_channels, int)
         | 
| 178 | 
            +
                        assert isinstance(in_index, int)
         | 
| 179 | 
            +
                        self.in_channels = in_channels
         | 
| 180 | 
            +
             | 
| 181 | 
            +
                def init_weights(self):
         | 
| 182 | 
            +
                    """Initialize weights of classification layer."""
         | 
| 183 | 
            +
                    # normal_init(self.conv_seg, mean=0, std=0.01)
         | 
| 184 | 
            +
                    # normal_init(self.conv1, mean=0, std=0.01)
         | 
| 185 | 
            +
             | 
| 186 | 
            +
                def _transform_inputs(self, inputs):
         | 
| 187 | 
            +
                    """Transform inputs for decoder.
         | 
| 188 | 
            +
             | 
| 189 | 
            +
                    Args:
         | 
| 190 | 
            +
                        inputs (list[Tensor]): List of multi-level img features.
         | 
| 191 | 
            +
             | 
| 192 | 
            +
                    Returns:
         | 
| 193 | 
            +
                        Tensor: The transformed inputs
         | 
| 194 | 
            +
                    """
         | 
| 195 | 
            +
             | 
| 196 | 
            +
                    if self.input_transform == 'resize_concat':
         | 
| 197 | 
            +
                        inputs = [inputs[i] for i in self.in_index]
         | 
| 198 | 
            +
                        upsampled_inputs = [
         | 
| 199 | 
            +
                            resize(
         | 
| 200 | 
            +
                                input=x,
         | 
| 201 | 
            +
                                size=inputs[0].shape[2:],
         | 
| 202 | 
            +
                                mode='bilinear',
         | 
| 203 | 
            +
                                align_corners=self.align_corners) for x in inputs
         | 
| 204 | 
            +
                        ]
         | 
| 205 | 
            +
                        inputs = torch.cat(upsampled_inputs, dim=1)
         | 
| 206 | 
            +
                    elif self.input_transform == 'multiple_select':
         | 
| 207 | 
            +
                        inputs = [inputs[i] for i in self.in_index]
         | 
| 208 | 
            +
                    else:
         | 
| 209 | 
            +
                        inputs = inputs[self.in_index]
         | 
| 210 | 
            +
             | 
| 211 | 
            +
                    return inputs
         | 
| 212 | 
            +
             | 
| 213 | 
            +
                def forward(self, inputs):
         | 
| 214 | 
            +
                    """Placeholder of forward function."""
         | 
| 215 | 
            +
                    pass
         | 
| 216 | 
            +
             | 
| 217 | 
            +
                def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg):
         | 
| 218 | 
            +
                    """Forward function for training.
         | 
| 219 | 
            +
                    Args:
         | 
| 220 | 
            +
                        inputs (list[Tensor]): List of multi-level img features.
         | 
| 221 | 
            +
                        img_metas (list[dict]): List of image info dict where each dict
         | 
| 222 | 
            +
                            has: 'img_shape', 'scale_factor', 'flip', and may also contain
         | 
| 223 | 
            +
                            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
         | 
| 224 | 
            +
                            For details on the values of these keys see
         | 
| 225 | 
            +
                            `mmseg/datasets/pipelines/formatting.py:Collect`.
         | 
| 226 | 
            +
                        gt_semantic_seg (Tensor): Semantic segmentation masks
         | 
| 227 | 
            +
                            used if the architecture supports semantic segmentation task.
         | 
| 228 | 
            +
                        train_cfg (dict): The training config.
         | 
| 229 | 
            +
             | 
| 230 | 
            +
                    Returns:
         | 
| 231 | 
            +
                        dict[str, Tensor]: a dictionary of loss components
         | 
| 232 | 
            +
                    """
         | 
| 233 | 
            +
                    seg_logits = self.forward(inputs)
         | 
| 234 | 
            +
                    losses = self.losses(seg_logits, gt_semantic_seg)
         | 
| 235 | 
            +
                    return losses
         | 
| 236 | 
            +
             | 
| 237 | 
            +
                def forward_test(self, inputs, img_metas, test_cfg):
         | 
| 238 | 
            +
                    """Forward function for testing.
         | 
| 239 | 
            +
             | 
| 240 | 
            +
                    Args:
         | 
| 241 | 
            +
                        inputs (list[Tensor]): List of multi-level img features.
         | 
| 242 | 
            +
                        img_metas (list[dict]): List of image info dict where each dict
         | 
| 243 | 
            +
                            has: 'img_shape', 'scale_factor', 'flip', and may also contain
         | 
| 244 | 
            +
                            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
         | 
| 245 | 
            +
                            For details on the values of these keys see
         | 
| 246 | 
            +
                            `mmseg/datasets/pipelines/formatting.py:Collect`.
         | 
| 247 | 
            +
                        test_cfg (dict): The testing config.
         | 
| 248 | 
            +
             | 
| 249 | 
            +
                    Returns:
         | 
| 250 | 
            +
                        Tensor: Output segmentation map.
         | 
| 251 | 
            +
                    """
         | 
| 252 | 
            +
                    return self.forward(inputs)
         | 
| 253 | 
            +
             | 
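The contract enforced by _init_inputs is easiest to see with 'multiple_select': in_channels and in_index must be equal-length sequences, and _transform_inputs then simply picks those levels out of the feature list. A small sketch with made-up channel counts follows; only torch is needed here, since no conv layers are built.

# Hypothetical configuration, purely to illustrate the in_channels/in_index pairing.
import torch

head = BaseDecodeHead([96, 192, 384, 768], 512, num_classes=1,
                      in_index=[0, 1, 2, 3],
                      input_transform='multiple_select')
feats = [torch.randn(1, c, 60 // 2 ** i, 80 // 2 ** i)
         for i, c in enumerate([96, 192, 384, 768])]
selected = head._transform_inputs(feats)   # the same 4 feature maps, selected by index
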
class UPerHead(BaseDecodeHead):
    def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs):
        super(UPerHead, self).__init__(
            input_transform='multiple_select', **kwargs)
        # FPN Module
        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        for in_channels in self.in_channels:  # skip the top layer
            l_conv = ConvModule(
                in_channels,
                self.channels,
                1,
                conv_cfg=self.conv_cfg,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg,
                inplace=True)
            fpn_conv = ConvModule(
                self.channels,
                self.channels,
                3,
                padding=1,
                conv_cfg=self.conv_cfg,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg,
                inplace=True)
            self.lateral_convs.append(l_conv)
            self.fpn_convs.append(fpn_conv)

    def forward(self, inputs):
        """Forward function."""

        inputs = self._transform_inputs(inputs)

        # build laterals
        laterals = [
            lateral_conv(inputs[i])
            for i, lateral_conv in enumerate(self.lateral_convs)
        ]

        # laterals.append(self.psp_forward(inputs))

        # build top-down path
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            prev_shape = laterals[i - 1].shape[2:]
            laterals[i - 1] += resize(
                laterals[i],
                size=prev_shape,
                mode='bilinear',
                align_corners=self.align_corners)

        # build outputs
        fpn_outs = [
            self.fpn_convs[i](laterals[i])
            for i in range(used_backbone_levels - 1)
        ]
        # append psp feature
        fpn_outs.append(laterals[-1])

        return fpn_outs[0]

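Note that this UPerHead variant keeps the PSP branch commented out and returns only fpn_outs[0], the highest-resolution lateral after top-down fusion, rather than a fused multi-scale map. A rough shape sketch under the same illustrative assumptions as above (mmcv-style ConvModule available, invented Swin-like channel counts):

# Illustrative only: shapes are invented, not read from the commit.
import torch

uper = UPerHead(in_channels=[192, 384, 768, 1536], channels=512,
                num_classes=1, in_index=[0, 1, 2, 3],
                norm_cfg=dict(type='BN', requires_grad=True))
feats = [torch.randn(1, 192, 120, 160), torch.randn(1, 384, 60, 80),
         torch.randn(1, 768, 30, 40), torch.randn(1, 1536, 15, 20)]
out = uper(feats)   # single tensor of shape (1, 512, 120, 160)
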
class PSP(BaseDecodeHead):
    """Unified Perceptual Parsing for Scene Understanding.

    This head is the implementation of `UPerNet
    <https://arxiv.org/abs/1807.10221>`_.

    Args:
        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module applied on the last feature. Default: (1, 2, 3, 6).
    """

    def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs):
        super(PSP, self).__init__(
            input_transform='multiple_select', **kwargs)
        # PSP Module
        self.psp_modules = PPM(
            pool_scales,
            self.in_channels[-1],
            self.channels,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg,
            align_corners=self.align_corners)
        self.bottleneck = ConvModule(
            self.in_channels[-1] + len(pool_scales) * self.channels,
            self.channels,
            3,
            padding=1,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)

    def psp_forward(self, inputs):
        """Forward function of PSP module."""
        x = inputs[-1]
        psp_outs = [x]
        psp_outs.extend(self.psp_modules(x))
        psp_outs = torch.cat(psp_outs, dim=1)
        output = self.bottleneck(psp_outs)

        return output

    def forward(self, inputs):
        """Forward function."""
        inputs = self._transform_inputs(inputs)

        return self.psp_forward(inputs)
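PSP, by contrast, consumes only the coarsest feature map (inputs[-1]): the pooled context from PPM is concatenated back onto it and squeezed to self.channels by the 3x3 bottleneck. A sketch, again with invented shapes and assuming mmcv-style ConvModule:

# Illustrative only: shapes are invented, not read from the commit.
import torch

psp = PSP(in_channels=[192, 384, 768, 1536], channels=512,
          num_classes=1, in_index=[0, 1, 2, 3],
          norm_cfg=dict(type='BN', requires_grad=True))
feats = [torch.randn(1, 192, 120, 160), torch.randn(1, 384, 60, 80),
         torch.randn(1, 768, 30, 40), torch.randn(1, 1536, 15, 20)]
context = psp(feats)   # (1, 512, 15, 20), context at the coarsest scale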
    	
        iebins/sum_depth.py
    ADDED
    
@@ -0,0 +1,22 @@
import torch
import torch.nn as nn
import numpy as np


class Sum_depth(nn.Module):
    def __init__(self):
        super(Sum_depth, self).__init__()
        self.sum_conv = nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=1, bias=False)
        sum_k = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]])

        sum_k = torch.from_numpy(sum_k).float().view(1, 1, 3, 3)
        self.sum_conv.weight = nn.Parameter(sum_k)

        for param in self.parameters():
            param.requires_grad = False

    def forward(self, x):
        out = self.sum_conv(x)
        out = out.contiguous().view(-1, 1, x.size(2), x.size(3))

        return out
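Sum_depth is a frozen 3x3 box-filter convolution (all-ones kernel, no bias, gradients disabled). One plausible use, sketched below, is counting valid pixels in each 3x3 neighbourhood of a sparse depth or mask tensor; the tensors here are random placeholders, not how train.py necessarily uses it.

# Illustrative usage of the frozen box-sum filter.
import torch

sum_depth = Sum_depth()
depth = torch.rand(2, 1, 480, 640)      # placeholder depth batch (B, 1, H, W)
valid = (depth > 0.5).float()           # placeholder validity mask
neighbour_count = sum_depth(valid)      # (2, 1, 480, 640), each value in [0, 9]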
    	
        iebins/test.py
    ADDED
    
@@ -0,0 +1,209 @@
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn
from torch.autograd import Variable

import os, sys, errno
import argparse
import time
import numpy as np
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm
import open3d as o3d

from utils import post_process_depth, D_to_cloud, flip_lr, inv_normalize

from networks.NewCRFDepth import NewCRFDepth


def convert_arg_line_to_args(arg_line):
    for arg in arg_line.split():
        if not arg.strip():
            continue
        yield arg


parser = argparse.ArgumentParser(description='IEBins PyTorch implementation.', fromfile_prefix_chars='@')
parser.convert_arg_line_to_args = convert_arg_line_to_args

parser.add_argument('--model_name', type=str, help='model name', default='iebins')
parser.add_argument('--encoder', type=str, help='type of encoder, base07, large07, tiny07', default='large07')
parser.add_argument('--data_path', type=str, help='path to the data', required=True)
parser.add_argument('--filenames_file', type=str, help='path to the filenames text file', required=True)
parser.add_argument('--input_height', type=int, help='input height', default=480)
parser.add_argument('--input_width', type=int, help='input width', default=640)
parser.add_argument('--max_depth', type=float, help='maximum depth in estimation', default=10)
parser.add_argument('--checkpoint_path', type=str, help='path to a specific checkpoint to load', default='')
parser.add_argument('--dataset', type=str, help='dataset to train on', default='nyu')
parser.add_argument('--do_kb_crop', help='if set, crop input images as kitti benchmark images', action='store_true')
parser.add_argument('--pred_clouds', help='if set, predict point clouds', action='store_true')
parser.add_argument('--save_viz', help='if set, save visualization of the outputs', action='store_true')

if sys.argv.__len__() == 2:
    arg_filename_with_prefix = '@' + sys.argv[1]
    args = parser.parse_args([arg_filename_with_prefix])
else:
    args = parser.parse_args()

if args.dataset == 'kitti' or args.dataset == 'nyu':
    from dataloaders.dataloader import NewDataLoader

model_dir = os.path.dirname(args.checkpoint_path)
sys.path.append(model_dir)


def get_num_lines(file_path):
    f = open(file_path, 'r')
    lines = f.readlines()
    f.close()
    return len(lines)


def test(params):
    """Test function."""
    args.mode = 'test'
    dataloader = NewDataLoader(args, 'test')

    model = NewCRFDepth(version='large07', inv_depth=False, max_depth=args.max_depth)
    model = torch.nn.DataParallel(model)

    checkpoint = torch.load(args.checkpoint_path)
    model.load_state_dict(checkpoint['model'])
    model.eval()
    model.cuda()

    num_params = sum([np.prod(p.size()) for p in model.parameters()])
    print("Total number of parameters: {}".format(num_params))

    num_test_samples = get_num_lines(args.filenames_file)

    with open(args.filenames_file) as f:
        lines = f.readlines()

    print('now testing {} files with {}'.format(num_test_samples, args.checkpoint_path))

    pred_depths = []
    pred_clouds = []
    start_time = time.time()
    with torch.no_grad():
        for _, sample in enumerate(tqdm(dataloader.data)):
            image = Variable(sample['image'].cuda())
            inv_K_p = Variable(sample['inv_K_p'].cuda())
            b, _, h, w = image.shape
            depth_to_cloud = D_to_cloud(b, h, w).cuda()

            # Predict
            pred_depths_r_list, _, _ = model(image)
            post_process = True
            if post_process:
                image_flipped = flip_lr(image)
                pred_depths_r_list_flipped, _, _ = model(image_flipped)
                pred_depth = post_process_depth(pred_depths_r_list[-1], pred_depths_r_list_flipped[-1])

            if args.pred_clouds:
                if args.dataset == 'nyu':
                    color = inv_normalize(image[0, :, :, :]).permute(1, 2, 0)[45:472, 43:608, :].reshape(-1, 3).cpu().numpy()
                    points = depth_to_cloud(pred_depth, inv_K_p).reshape(1, h, w, 3)[:, 45:472, 43:608, :].reshape(1, -1, 3)
                    points = points.cpu().numpy().squeeze()
                else:
                    color = inv_normalize(image[0, :, :, :]).permute(1, 2, 0).reshape(-1, 3).cpu().numpy()
                    points = depth_to_cloud(pred_depth, inv_K_p)
                    points = points.cpu().numpy().squeeze()
                pc = o3d.geometry.PointCloud()
                pc.points = o3d.utility.Vector3dVector(points)
                pc.colors = o3d.utility.Vector3dVector(color)

                pred_clouds.append(pc)

            pred_depth = pred_depth.cpu().numpy().squeeze()

            if args.do_kb_crop:
                height, width = 352, 1216
                top_margin = int(height - 352)
                left_margin = int((width - 1216) / 2)
                pred_depth_uncropped = np.zeros((height, width), dtype=np.float32)
                pred_depth_uncropped[top_margin:top_margin + 352, left_margin:left_margin + 1216] = pred_depth
                pred_depth = pred_depth_uncropped

            pred_depths.append(pred_depth)

    elapsed_time = time.time() - start_time
    print('Elapsed time: %s' % str(elapsed_time))
    print('Done.')

    save_name = 'models/result_' + args.model_name

    print('Saving result pngs..')
    if not os.path.exists(save_name):
        try:
            os.mkdir(save_name)
            os.mkdir(save_name + '/raw')
            os.mkdir(save_name + '/cmap')
            os.mkdir(save_name + '/rgb')
            os.mkdir(save_name + '/gt')
            os.mkdir(save_name + '/cloud')
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    for s in tqdm(range(num_test_samples)):
        if args.dataset == 'kitti':
            date_drive = lines[s].split('/')[1]
            filename_pred_png = save_name + '/raw/' + date_drive + '_' + lines[s].split()[0].split('/')[-1].replace(
                '.jpg', '.png')
            filename_pred_ply = save_name + '/cloud/' + date_drive + '_' + lines[s].split()[0].split('/')[-1][:-4] + '_' + 'iebins' + '.ply'
            filename_cmap_png = save_name + '/cmap/' + date_drive + '_' + lines[s].split()[0].split('/')[
                -1].replace('.jpg', '.png')
            filename_image_png = save_name + '/rgb/' + date_drive + '_' + lines[s].split()[0].split('/')[-1]
        elif args.dataset == 'kittipred':
            filename_pred_png = save_name + '/raw/' + lines[s].split()[0].split('/')[-1].replace('.jpg', '.png')
            filename_cmap_png = save_name + '/cmap/' + lines[s].split()[0].split('/')[-1].replace('.jpg', '.png')
            filename_image_png = save_name + '/rgb/' + lines[s].split()[0].split('/')[-1]
        else:
            scene_name = lines[s].split()[0].split('/')[0]
            filename_pred_png = save_name + '/raw/' + scene_name + '_' + lines[s].split()[0].split('/')[1].replace(
                '.jpg', '.png')
            filename_pred_ply = save_name + '/cloud/' + scene_name + '_' + lines[s].split()[0].split('/')[1][:-4] + '_' + 'iebins' + '.ply'
            filename_cmap_png = save_name + '/cmap/' + scene_name + '_' + lines[s].split()[0].split('/rgb_')[1].replace(
                '.jpg', '.png')
            filename_gt_png = save_name + '/gt/' + scene_name + '_' + lines[s].split()[0].split('/rgb_')[1].replace(
                '.jpg', '_gt.png')
            filename_image_png = save_name + '/rgb/' + scene_name + '_' + lines[s].split()[0].split('/rgb_')[1]

        rgb_path = os.path.join(args.data_path, './' + lines[s].split()[0])
        image = cv2.imread(rgb_path)
        if args.dataset == 'nyu':
            gt_path = os.path.join(args.data_path, './' + lines[s].split()[1])
            gt = cv2.imread(gt_path, -1).astype(np.float32) / 1000.0  # Visualization purpose only
            gt[gt == 0] = np.amax(gt)

        pred_depth = pred_depths[s]

        if args.dataset == 'kitti' or args.dataset == 'kittipred':
            pred_depth_scaled = pred_depth * 256.0
        else:
            pred_depth_scaled = pred_depth * 1000.0

        pred_depth_scaled = pred_depth_scaled.astype(np.uint16)
        cv2.imwrite(filename_pred_png, pred_depth_scaled, [cv2.IMWRITE_PNG_COMPRESSION, 0])

        if args.save_viz:
            cv2.imwrite(filename_image_png, image[10:-1 - 9, 10:-1 - 9, :])
            if args.dataset == 'nyu':
                plt.imsave(filename_gt_png, (10 - gt) / 10, cmap='jet')
                pred_depth_cropped = pred_depth[10:-1 - 9, 10:-1 - 9]
                plt.imsave(filename_cmap_png, (10 - pred_depth) / 10, cmap='jet')
            else:
                plt.imsave(filename_cmap_png, np.log10(pred_depth), cmap='magma')

        if args.pred_clouds:
            pred_cloud = pred_clouds[s]
            o3d.io.write_point_cloud(filename_pred_ply, pred_cloud)

    return


if __name__ == '__main__':
    test(args)
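Because the parser sets fromfile_prefix_chars='@' and a lone positional argument is rewritten to '@<file>', the script is normally driven by an arguments file. A hypothetical example follows; the file name, paths and split file are placeholders, not taken from this commit.

# arguments_test_nyu.txt (hypothetical):
#   --data_path ./datasets/nyu/
#   --filenames_file ./data_splits/nyu_test_files.txt
#   --checkpoint_path ./checkpoints/nyu_L.pth
#   --dataset nyu
#   --max_depth 10
#   --save_viz
# convert_arg_line_to_args splits each line on whitespace, so several tokens may
# share one line. The invocation would then look like:
#   python iebins/test.py arguments_test_nyu.txt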
    	
        iebins/train.py
    ADDED
    
@@ -0,0 +1,499 @@
| 1 | 
            +
            import torch
         | 
| 2 | 
            +
            import torch.nn as nn
         | 
| 3 | 
            +
            import torch.nn.utils as utils
         | 
| 4 | 
            +
            import torch.backends.cudnn as cudnn
         | 
| 5 | 
            +
            import torch.distributed as dist
         | 
| 6 | 
            +
            import torch.multiprocessing as mp
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            import os, sys, time
         | 
| 9 | 
            +
            from telnetlib import IP
         | 
| 10 | 
            +
            import argparse
         | 
| 11 | 
            +
            import numpy as np
         | 
| 12 | 
            +
            from tqdm import tqdm
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            from tensorboardX import SummaryWriter
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            from utils import post_process_depth, flip_lr, silog_loss, compute_errors, eval_metrics, entropy_loss, colormap, \
         | 
| 17 | 
            +
                                   block_print, enable_print, normalize_result, inv_normalize, convert_arg_line_to_args, colormap_magma
         | 
| 18 | 
            +
            from networks.NewCRFDepth import NewCRFDepth
         | 
| 19 | 
            +
            from networks.depth_update import *
         | 
| 20 | 
            +
            from datetime import datetime
         | 
| 21 | 
            +
            from sum_depth import Sum_depth
         | 
| 22 | 
            +
             | 
| 23 | 
            +
             | 
| 24 | 
            +
            parser = argparse.ArgumentParser(description='IEBins PyTorch implementation.', fromfile_prefix_chars='@')
         | 
| 25 | 
            +
            parser.convert_arg_line_to_args = convert_arg_line_to_args
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            parser.add_argument('--mode',                      type=str,   help='train or test', default='train')
         | 
| 28 | 
            +
            parser.add_argument('--model_name',                type=str,   help='model name', default='iebins')
         | 
| 29 | 
            +
            parser.add_argument('--encoder',                   type=str,   help='type of encoder, base07, large07, tiny07', default='large07')
         | 
| 30 | 
            +
            parser.add_argument('--pretrain',                  type=str,   help='path of pretrained encoder', default=None)
         | 
| 31 | 
            +
             | 
| 32 | 
            +
            # Dataset
         | 
| 33 | 
            +
            parser.add_argument('--dataset',                   type=str,   help='dataset to train on, kitti or nyu', default='nyu')
         | 
| 34 | 
            +
            parser.add_argument('--data_path',                 type=str,   help='path to the data', required=True)
         | 
| 35 | 
            +
            parser.add_argument('--gt_path',                   type=str,   help='path to the groundtruth data', required=True)
         | 
| 36 | 
            +
            parser.add_argument('--filenames_file',            type=str,   help='path to the filenames text file', required=True)
         | 
| 37 | 
            +
            parser.add_argument('--input_height',              type=int,   help='input height', default=480)
         | 
| 38 | 
            +
            parser.add_argument('--input_width',               type=int,   help='input width',  default=640)
         | 
| 39 | 
            +
            parser.add_argument('--max_depth',                 type=float, help='maximum depth in estimation', default=10)
         | 
| 40 | 
            +
            parser.add_argument('--min_depth',                 type=float, help='minimum depth in estimation', default=0.1)
         | 
| 41 | 
            +
             | 
| 42 | 
            +
            # Log and save
         | 
| 43 | 
            +
            parser.add_argument('--log_directory',             type=str,   help='directory to save checkpoints and summaries', default='')
         | 
| 44 | 
            +
            parser.add_argument('--checkpoint_path',           type=str,   help='path to a checkpoint to load', default='')
         | 
| 45 | 
            +
            parser.add_argument('--log_freq',                  type=int,   help='Logging frequency in global steps', default=100)
         | 
| 46 | 
            +
            parser.add_argument('--save_freq',                 type=int,   help='Checkpoint saving frequency in global steps', default=5000)
         | 
| 47 | 
            +
             | 
| 48 | 
            +
            # Training
         | 
| 49 | 
            +
            parser.add_argument('--weight_decay',              type=float, help='weight decay factor for optimization', default=1e-2)
         | 
| 50 | 
            +
            parser.add_argument('--retrain',                               help='if used with checkpoint_path, will restart training from step zero', action='store_true')
         | 
| 51 | 
            +
            parser.add_argument('--adam_eps',                  type=float, help='epsilon in Adam optimizer', default=1e-6)
         | 
| 52 | 
            +
            parser.add_argument('--batch_size',                type=int,   help='batch size', default=4)
         | 
| 53 | 
            +
            parser.add_argument('--num_epochs',                type=int,   help='number of epochs', default=50)
         | 
| 54 | 
            +
            parser.add_argument('--learning_rate',             type=float, help='initial learning rate', default=1e-4)
         | 
| 55 | 
            +
            parser.add_argument('--end_learning_rate',         type=float, help='end learning rate', default=-1)
         | 
| 56 | 
            +
            parser.add_argument('--variance_focus',            type=float, help='lambda in paper: [0, 1], higher value more focus on minimizing variance of error', default=0.85)
         | 
| 57 | 
            +
             | 
| 58 | 
            +
            # Preprocessing
         | 
| 59 | 
            +
            parser.add_argument('--do_random_rotate',                      help='if set, will perform random rotation for augmentation', action='store_true')
         | 
| 60 | 
            +
            parser.add_argument('--degree',                    type=float, help='random rotation maximum degree', default=2.5)
         | 
| 61 | 
            +
            parser.add_argument('--do_kb_crop',                            help='if set, crop input images as KITTI benchmark images', action='store_true')
         | 
| 62 | 
            +
            parser.add_argument('--use_right',                             help='if set, will randomly use right images when training on KITTI', action='store_true')
         | 
| 63 | 
            +
             | 
| 64 | 
            +
            # Multi-gpu training
         | 
| 65 | 
            +
            parser.add_argument('--num_threads',               type=int,   help='number of threads to use for data loading', default=1)
         | 
| 66 | 
            +
            parser.add_argument('--world_size',                type=int,   help='number of nodes for distributed training', default=1)
         | 
| 67 | 
            +
            parser.add_argument('--rank',                      type=int,   help='node rank for distributed training', default=0)
         | 
| 68 | 
            +
            parser.add_argument('--dist_url',                  type=str,   help='url used to set up distributed training', default='tcp://127.0.0.1:1234')
         | 
| 69 | 
            +
            parser.add_argument('--dist_backend',              type=str,   help='distributed backend', default='nccl')
         | 
| 70 | 
            +
            parser.add_argument('--gpu',                       type=int,   help='GPU id to use.', default=None)
         | 
| 71 | 
            +
            parser.add_argument('--multiprocessing_distributed',           help='Use multi-processing distributed training to launch '
         | 
| 72 | 
            +
                                                                                'N processes per node, which has N GPUs. This is the '
         | 
| 73 | 
            +
                                                                                'fastest way to use PyTorch for either single node or '
         | 
| 74 | 
            +
                                                                                'multi node data parallel training', action='store_true',)
         | 
| 75 | 
            +
            # Online eval
         | 
| 76 | 
            +
            parser.add_argument('--do_online_eval',                        help='if set, perform online eval in every eval_freq steps', action='store_true')
         | 
| 77 | 
            +
            parser.add_argument('--data_path_eval',            type=str,   help='path to the data for online evaluation', required=False)
         | 
| 78 | 
            +
            parser.add_argument('--gt_path_eval',              type=str,   help='path to the groundtruth data for online evaluation', required=False)
         | 
| 79 | 
            +
            parser.add_argument('--filenames_file_eval',       type=str,   help='path to the filenames text file for online evaluation', required=False)
         | 
| 80 | 
            +
            parser.add_argument('--min_depth_eval',            type=float, help='minimum depth for evaluation', default=1e-3)
         | 
| 81 | 
            +
            parser.add_argument('--max_depth_eval',            type=float, help='maximum depth for evaluation', default=80)
         | 
| 82 | 
            +
            parser.add_argument('--eigen_crop',                            help='if set, crops according to Eigen NIPS14', action='store_true')
         | 
| 83 | 
            +
            parser.add_argument('--garg_crop',                             help='if set, crops according to Garg  ECCV16', action='store_true')
         | 
| 84 | 
            +
            parser.add_argument('--eval_freq',                 type=int,   help='Online evaluation frequency in global steps', default=500)
         | 
| 85 | 
            +
            parser.add_argument('--eval_summary_directory',    type=str,   help='output directory for eval summary,'
         | 
| 86 | 
            +
                                                                                'if empty outputs to checkpoint folder', default='')
         | 
| 87 | 
            +
             | 
| 88 | 
            +
            if sys.argv.__len__() == 2:
         | 
| 89 | 
            +
                arg_filename_with_prefix = '@' + sys.argv[1]
         | 
| 90 | 
            +
                args = parser.parse_args([arg_filename_with_prefix])
         | 
| 91 | 
            +
            else:
         | 
| 92 | 
            +
                args = parser.parse_args()
         | 
| 93 | 
            +
             | 
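A minimal usage sketch (the arguments file name is a placeholder): when exactly one extra argument is passed, the branch above prefixes it with '@' so argparse expands the options from that file, e.g.

    python iebins/train.py @<arguments_train>.txt

convert_arg_line_to_args (defined in iebins/utils.py further down) splits each line of that file into whitespace-separated '--flag value' tokens.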
| 94 | 
            +
            if args.dataset == 'kitti' or args.dataset == 'nyu':
         | 
| 95 | 
            +
                from dataloaders.dataloader import NewDataLoader
         | 
| 96 | 
            +
             | 
| 97 | 
            +
             | 
| 98 | 
            +
            def online_eval(model, dataloader_eval, gpu, epoch, ngpus, group, post_process=False):
         | 
| 99 | 
            +
                eval_measures = torch.zeros(10).cuda(device=gpu)
         | 
| 100 | 
            +
                for _, eval_sample_batched in enumerate(tqdm(dataloader_eval.data)):
         | 
| 101 | 
            +
                    with torch.no_grad():
         | 
| 102 | 
            +
                        image = torch.autograd.Variable(eval_sample_batched['image'].cuda(gpu, non_blocking=True))
         | 
| 103 | 
            +
                        gt_depth = eval_sample_batched['depth']
         | 
| 104 | 
            +
                        has_valid_depth = eval_sample_batched['has_valid_depth']
         | 
| 105 | 
            +
                        if not has_valid_depth:
         | 
| 106 | 
            +
                            # print('Invalid depth. continue.')
         | 
| 107 | 
            +
                            continue
         | 
| 108 | 
            +
                       
         | 
| 109 | 
            +
                        pred_depths_r_list, _, _ = model(image)
         | 
| 110 | 
            +
                        if post_process:
         | 
| 111 | 
            +
                            image_flipped = flip_lr(image)
         | 
| 112 | 
            +
                            pred_depths_r_list_flipped, _, _ = model(image_flipped)
         | 
| 113 | 
            +
                            pred_depth = post_process_depth(pred_depths_r_list[-1], pred_depths_r_list_flipped[-1])
         | 
| 114 | 
            +
             | 
| 115 | 
            +
                        pred_depth = pred_depth.cpu().numpy().squeeze()
         | 
| 116 | 
            +
                        gt_depth = gt_depth.cpu().numpy().squeeze()
         | 
| 117 | 
            +
             | 
| 118 | 
            +
                    if args.do_kb_crop:
         | 
| 119 | 
            +
                        height, width = gt_depth.shape
         | 
| 120 | 
            +
                        top_margin = int(height - 352)
         | 
| 121 | 
            +
                        left_margin = int((width - 1216) / 2)
         | 
| 122 | 
            +
                        pred_depth_uncropped = np.zeros((height, width), dtype=np.float32)
         | 
| 123 | 
            +
                        pred_depth_uncropped[top_margin:top_margin + 352, left_margin:left_margin + 1216] = pred_depth
         | 
| 124 | 
            +
                        pred_depth = pred_depth_uncropped
         | 
| 125 | 
            +
             | 
| 126 | 
            +
                    pred_depth[pred_depth < args.min_depth_eval] = args.min_depth_eval
         | 
| 127 | 
            +
                    pred_depth[pred_depth > args.max_depth_eval] = args.max_depth_eval
         | 
| 128 | 
            +
                    pred_depth[np.isinf(pred_depth)] = args.max_depth_eval
         | 
| 129 | 
            +
                    pred_depth[np.isnan(pred_depth)] = args.min_depth_eval
         | 
| 130 | 
            +
             | 
| 131 | 
            +
                    valid_mask = np.logical_and(gt_depth > args.min_depth_eval, gt_depth < args.max_depth_eval)
         | 
| 132 | 
            +
             | 
| 133 | 
            +
                    if args.garg_crop or args.eigen_crop:
         | 
| 134 | 
            +
                        gt_height, gt_width = gt_depth.shape
         | 
| 135 | 
            +
                        eval_mask = np.zeros(valid_mask.shape)
         | 
| 136 | 
            +
             | 
| 137 | 
            +
                        if args.garg_crop:
         | 
| 138 | 
            +
                            eval_mask[int(0.40810811 * gt_height):int(0.99189189 * gt_height), int(0.03594771 * gt_width):int(0.96405229 * gt_width)] = 1
         | 
| 139 | 
            +
             | 
| 140 | 
            +
                        elif args.eigen_crop:
         | 
| 141 | 
            +
                            if args.dataset == 'kitti':
         | 
| 142 | 
            +
                                eval_mask[int(0.3324324 * gt_height):int(0.91351351 * gt_height), int(0.0359477 * gt_width):int(0.96405229 * gt_width)] = 1
         | 
| 143 | 
            +
                            elif args.dataset == 'nyu':
         | 
| 144 | 
            +
                                eval_mask[45:471, 41:601] = 1
         | 
| 145 | 
            +
             | 
| 146 | 
            +
                        valid_mask = np.logical_and(valid_mask, eval_mask)
         | 
| 147 | 
            +
             | 
| 148 | 
            +
                    measures = compute_errors(gt_depth[valid_mask], pred_depth[valid_mask])
         | 
| 149 | 
            +
             | 
| 150 | 
            +
                    eval_measures[:9] += torch.tensor(measures).cuda(device=gpu)
         | 
| 151 | 
            +
                    eval_measures[9] += 1
         | 
| 152 | 
            +
             | 
| 153 | 
            +
                if args.multiprocessing_distributed:
         | 
| 154 | 
            +
                    # group = dist.new_group([i for i in range(ngpus)])
         | 
| 155 | 
            +
                    dist.all_reduce(tensor=eval_measures, op=dist.ReduceOp.SUM, group=group)
         | 
| 156 | 
            +
             | 
| 157 | 
            +
                if not args.multiprocessing_distributed or gpu == 0:
         | 
| 158 | 
            +
                    eval_measures_cpu = eval_measures.cpu()
         | 
| 159 | 
            +
                    cnt = eval_measures_cpu[9].item()
         | 
| 160 | 
            +
                    eval_measures_cpu /= cnt
         | 
| 161 | 
            +
                    print('Computing errors for {} eval samples'.format(int(cnt)), ', post_process: ', post_process)
         | 
| 162 | 
            +
                    print("{:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}".format('silog', 'abs_rel', 'log10', 'rms',
         | 
| 163 | 
            +
                                                                                                 'sq_rel', 'log_rms', 'd1', 'd2',
         | 
| 164 | 
            +
                                                                                                 'd3'))
         | 
| 165 | 
            +
                    for i in range(8):
         | 
| 166 | 
            +
                        print('{:7.4f}, '.format(eval_measures_cpu[i]), end='')
         | 
| 167 | 
            +
                    print('{:7.4f}'.format(eval_measures_cpu[8]))
         | 
| 168 | 
            +
                    return eval_measures_cpu
         | 
| 169 | 
            +
             | 
| 170 | 
            +
                return None
         | 
| 171 | 
            +
             | 
| 172 | 
            +
             | 
| 173 | 
            +
            def main_worker(gpu, ngpus_per_node, args):
         | 
| 174 | 
            +
                args.gpu = gpu
         | 
| 175 | 
            +
             | 
| 176 | 
            +
                if args.gpu is not None:
         | 
| 177 | 
            +
                    print("== Use GPU: {} for training".format(args.gpu))
         | 
| 178 | 
            +
             | 
| 179 | 
            +
                if args.distributed:
         | 
| 180 | 
            +
                    if args.dist_url == "env://" and args.rank == -1:
         | 
| 181 | 
            +
                        args.rank = int(os.environ["RANK"])
         | 
| 182 | 
            +
                    if args.multiprocessing_distributed:
         | 
| 183 | 
            +
                        args.rank = args.rank * ngpus_per_node + gpu
         | 
| 184 | 
            +
                    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank)
         | 
| 185 | 
            +
             | 
| 186 | 
            +
                # model
         | 
| 187 | 
            +
                model = NewCRFDepth(version=args.encoder, inv_depth=False, max_depth=args.max_depth, pretrained=args.pretrain)
         | 
| 188 | 
            +
                model.train()
         | 
| 189 | 
            +
             | 
| 190 | 
            +
                num_params = sum([np.prod(p.size()) for p in model.parameters()])
         | 
| 191 | 
            +
                print("== Total number of parameters: {}".format(num_params))
         | 
| 192 | 
            +
             | 
| 193 | 
            +
                num_params_update = sum([np.prod(p.shape) for p in model.parameters() if p.requires_grad])
         | 
| 194 | 
            +
                print("== Total number of learning parameters: {}".format(num_params_update))
         | 
| 195 | 
            +
             | 
| 196 | 
            +
                if args.distributed:
         | 
| 197 | 
            +
                    if args.gpu is not None:
         | 
| 198 | 
            +
                        torch.cuda.set_device(args.gpu)
         | 
| 199 | 
            +
                        model.cuda(args.gpu)
         | 
| 200 | 
            +
                        args.batch_size = int(args.batch_size / ngpus_per_node)
         | 
| 201 | 
            +
                        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
         | 
| 202 | 
            +
                    else:
         | 
| 203 | 
            +
                        model.cuda()
         | 
| 204 | 
            +
                        model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True)
         | 
| 205 | 
            +
                else:
         | 
| 206 | 
            +
                    model = torch.nn.DataParallel(model)
         | 
| 207 | 
            +
                    model.cuda()
         | 
| 208 | 
            +
             | 
| 209 | 
            +
                if args.distributed:
         | 
| 210 | 
            +
                    print("== Model Initialized on GPU: {}".format(args.gpu))
         | 
| 211 | 
            +
                else:
         | 
| 212 | 
            +
                    print("== Model Initialized")
         | 
| 213 | 
            +
             | 
| 214 | 
            +
                global_step = 0
         | 
| 215 | 
            +
                best_eval_measures_lower_better = torch.zeros(6).cpu() + 1e3
         | 
| 216 | 
            +
                best_eval_measures_higher_better = torch.zeros(3).cpu()
         | 
| 217 | 
            +
                best_eval_steps = np.zeros(9, dtype=np.int32)
         | 
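    # Bookkeeping note: the nine tracked metrics follow eval_metrics in iebins/utils.py,
    # i.e. ['silog', 'abs_rel', 'log10', 'rms', 'sq_rel', 'log_rms', 'd1', 'd2', 'd3'];
    # the first six are lower-is-better (hence the 1e3 sentinel above), the last three
    # (the delta-threshold accuracies) are higher-is-better.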
| 218 | 
            +
             | 
| 219 | 
            +
                # Training parameters
         | 
| 220 | 
            +
                optimizer = torch.optim.Adam([{'params': model.module.parameters()}],
         | 
| 221 | 
            +
                                            lr=args.learning_rate)
         | 
| 222 | 
            +
             | 
| 223 | 
            +
                model_just_loaded = False
         | 
| 224 | 
            +
                if args.checkpoint_path != '':
         | 
| 225 | 
            +
                    if os.path.isfile(args.checkpoint_path):
         | 
| 226 | 
            +
                        print("== Loading checkpoint '{}'".format(args.checkpoint_path))
         | 
| 227 | 
            +
                        if args.gpu is None:
         | 
| 228 | 
            +
                            checkpoint = torch.load(args.checkpoint_path)
         | 
| 229 | 
            +
                        else:
         | 
| 230 | 
            +
                            loc = 'cuda:{}'.format(args.gpu)
         | 
| 231 | 
            +
                            checkpoint = torch.load(args.checkpoint_path, map_location=loc)
         | 
| 232 | 
            +
                        model.load_state_dict(checkpoint['model'])
         | 
| 233 | 
            +
                        optimizer.load_state_dict(checkpoint['optimizer'])
         | 
| 234 | 
            +
                        if not args.retrain:
         | 
| 235 | 
            +
                            try:
         | 
| 236 | 
            +
                                global_step = checkpoint['global_step']
         | 
| 237 | 
            +
                                best_eval_measures_higher_better = checkpoint['best_eval_measures_higher_better'].cpu()
         | 
| 238 | 
            +
                                best_eval_measures_lower_better = checkpoint['best_eval_measures_lower_better'].cpu()
         | 
| 239 | 
            +
                                best_eval_steps = checkpoint['best_eval_steps']
         | 
| 240 | 
            +
                            except KeyError:
         | 
| 241 | 
            +
                                print("Could not load values for online evaluation")
         | 
| 242 | 
            +
             | 
| 243 | 
            +
                        print("== Loaded checkpoint '{}' (global_step {})".format(args.checkpoint_path, checkpoint['global_step']))
         | 
| 244 | 
            +
                    else:
         | 
| 245 | 
            +
                        print("== No checkpoint found at '{}'".format(args.checkpoint_path))
         | 
| 246 | 
            +
                    model_just_loaded = True
         | 
| 247 | 
            +
                    del checkpoint
         | 
| 248 | 
            +
             | 
| 249 | 
            +
                cudnn.benchmark = True
         | 
| 250 | 
            +
             | 
| 251 | 
            +
                dataloader = NewDataLoader(args, 'train')
         | 
| 252 | 
            +
                dataloader_eval = NewDataLoader(args, 'online_eval')
         | 
| 253 | 
            +
             | 
| 254 | 
            +
                # ===== Evaluation before training ======
         | 
| 255 | 
            +
                # model.eval()
         | 
| 256 | 
            +
                # with torch.no_grad():
         | 
| 257 | 
            +
                #     eval_measures = online_eval(model, dataloader_eval, gpu, ngpus_per_node, post_process=True)
         | 
| 258 | 
            +
             | 
| 259 | 
            +
                # Logging
         | 
| 260 | 
            +
                if not args.multiprocessing_distributed or (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
         | 
| 261 | 
            +
                    writer = SummaryWriter(args.log_directory + '/' + args.model_name + '/summaries', flush_secs=30)
         | 
| 262 | 
            +
                    if args.do_online_eval:
         | 
| 263 | 
            +
                        if args.eval_summary_directory != '':
         | 
| 264 | 
            +
                            eval_summary_path = os.path.join(args.eval_summary_directory, args.model_name)
         | 
| 265 | 
            +
                        else:
         | 
| 266 | 
            +
                            eval_summary_path = os.path.join(args.log_directory, args.model_name, 'eval')
         | 
| 267 | 
            +
                        eval_summary_writer = SummaryWriter(eval_summary_path, flush_secs=30)
         | 
| 268 | 
            +
             | 
| 269 | 
            +
                silog_criterion = silog_loss(variance_focus=args.variance_focus)
         | 
| 270 | 
            +
                sum_localdepth = Sum_depth().cuda(args.gpu)
         | 
| 271 | 
            +
             | 
| 272 | 
            +
                start_time = time.time()
         | 
| 273 | 
            +
                duration = 0
         | 
| 274 | 
            +
             | 
| 275 | 
            +
                num_log_images = args.batch_size
         | 
| 276 | 
            +
                end_learning_rate = args.end_learning_rate if args.end_learning_rate != -1 else 0.1 * args.learning_rate
         | 
| 277 | 
            +
             | 
| 278 | 
            +
                var_sum = [var.sum().item() for var in model.parameters() if var.requires_grad]
         | 
| 279 | 
            +
                var_cnt = len(var_sum)
         | 
| 280 | 
            +
                var_sum = np.sum(var_sum)
         | 
| 281 | 
            +
             | 
| 282 | 
            +
                print("== Initial variables' sum: {:.3f}, avg: {:.3f}".format(var_sum, var_sum/var_cnt))
         | 
| 283 | 
            +
             | 
| 284 | 
            +
                steps_per_epoch = len(dataloader.data)
         | 
| 285 | 
            +
                num_total_steps = args.num_epochs * steps_per_epoch
         | 
| 286 | 
            +
                epoch = global_step // steps_per_epoch
         | 
| 287 | 
            +
                
         | 
| 288 | 
            +
    group = dist.new_group([i for i in range(ngpus_per_node)]) if args.distributed else None  # dist is only initialized in distributed mode
         | 
| 289 | 
            +
                while epoch < args.num_epochs:
         | 
| 290 | 
            +
                    if args.distributed:
         | 
| 291 | 
            +
                        dataloader.train_sampler.set_epoch(epoch)
         | 
| 292 | 
            +
             | 
| 293 | 
            +
                    for step, sample_batched in enumerate(dataloader.data):
         | 
| 294 | 
            +
                        optimizer.zero_grad()
         | 
| 295 | 
            +
                        before_op_time = time.time()
         | 
| 296 | 
            +
                        si_loss = 0
         | 
| 297 | 
            +
             | 
| 298 | 
            +
                        image = torch.autograd.Variable(sample_batched['image'].cuda(args.gpu, non_blocking=True))
         | 
| 299 | 
            +
                        depth_gt = torch.autograd.Variable(sample_batched['depth'].cuda(args.gpu, non_blocking=True))
         | 
| 300 | 
            +
             | 
| 301 | 
            +
                        pred_depths_r_list, pred_depths_c_list, uncertainty_maps_list = model(image, epoch, step)
         | 
| 302 | 
            +
                        
         | 
| 303 | 
            +
                        if args.dataset == 'nyu':
         | 
| 304 | 
            +
                            mask = depth_gt > 0.1
         | 
| 305 | 
            +
                        else:
         | 
| 306 | 
            +
                            mask = depth_gt > 1.0
         | 
| 307 | 
            +
                        
         | 
| 308 | 
            +
                        max_tree_depth = len(pred_depths_r_list)         
         | 
| 309 | 
            +
                        for curr_tree_depth in range(max_tree_depth):
         | 
| 310 | 
            +
             | 
| 311 | 
            +
                            si_loss += silog_criterion.forward(pred_depths_r_list[curr_tree_depth], depth_gt, mask.to(torch.bool))
         | 
| 312 | 
            +
             | 
| 313 | 
            +
                        loss = si_loss
         | 
| 314 | 
            +
                            
         | 
| 315 | 
            +
                        loss.backward()
         | 
| 316 | 
            +
                        for param_group in optimizer.param_groups:
         | 
| 317 | 
            +
                            current_lr = (args.learning_rate - end_learning_rate) * (1 - global_step / num_total_steps) ** 0.9 + end_learning_rate
         | 
| 318 | 
            +
                            param_group['lr'] = current_lr
         | 
| 319 | 
            +
             | 
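                        # Learning-rate schedule: polynomial ("poly") decay
                        #   lr(t) = (lr0 - lr_end) * (1 - t / T) ** 0.9 + lr_end,
                        # with lr_end defaulting to 0.1 * lr0 (see end_learning_rate above).
                        # Example with lr0 = 1e-4, lr_end = 1e-5: at t = T/2,
                        # (1 - 0.5) ** 0.9 ≈ 0.536, so lr ≈ 9e-5 * 0.536 + 1e-5 ≈ 5.8e-5.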
| 320 | 
            +
                        optimizer.step()
         | 
| 321 | 
            +
             | 
| 322 | 
            +
                        if not args.multiprocessing_distributed or (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
         | 
| 323 | 
            +
                            print('[epoch][s/s_per_e/gs]: [{}][{}/{}/{}], lr: {:.12f}, loss: {:.12f}'.format(epoch, step, steps_per_epoch, global_step, current_lr, loss))
         | 
| 324 | 
            +
                            # if np.isnan(loss.cpu().item()):
         | 
| 325 | 
            +
                            #     print('NaN in loss occurred. Aborting training.')
         | 
| 326 | 
            +
                            #     return -1
         | 
| 327 | 
            +
             | 
| 328 | 
            +
                        duration += time.time() - before_op_time
         | 
| 329 | 
            +
                        if global_step and global_step % args.log_freq == 0 and not model_just_loaded:
         | 
| 330 | 
            +
                            var_sum = [var.sum().item() for var in model.parameters() if var.requires_grad]
         | 
| 331 | 
            +
                            var_cnt = len(var_sum)
         | 
| 332 | 
            +
                            var_sum = np.sum(var_sum)
         | 
| 333 | 
            +
                            examples_per_sec = args.batch_size / duration * args.log_freq
         | 
| 334 | 
            +
                            duration = 0
         | 
| 335 | 
            +
                            time_sofar = (time.time() - start_time) / 3600
         | 
| 336 | 
            +
                            training_time_left = (num_total_steps / global_step - 1.0) * time_sofar
         | 
| 337 | 
            +
                            if not args.multiprocessing_distributed or (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
         | 
| 338 | 
            +
                                print("{}".format(args.model_name))
         | 
| 339 | 
            +
                            print_string = 'GPU: {} | examples/s: {:4.2f} | loss: {:.5f} | var sum: {:.3f} avg: {:.3f} | time elapsed: {:.2f}h | time left: {:.2f}h'
         | 
| 340 | 
            +
                            print(print_string.format(args.gpu, examples_per_sec, loss, var_sum.item(), var_sum.item()/var_cnt, time_sofar, training_time_left))
         | 
| 341 | 
            +
             | 
| 342 | 
            +
                            if not args.multiprocessing_distributed or (args.multiprocessing_distributed
         | 
| 343 | 
            +
                                                                        and args.rank % ngpus_per_node == 0):
         | 
| 344 | 
            +
                                writer.add_scalar('silog_loss', si_loss, global_step)
         | 
| 345 | 
            +
                                # writer.add_scalar('var_loss', var_loss, global_step)
         | 
| 346 | 
            +
                                writer.add_scalar('learning_rate', current_lr, global_step)
         | 
| 347 | 
            +
                                writer.add_scalar('var average', var_sum.item()/var_cnt, global_step)
         | 
| 348 | 
            +
                                depth_gt = torch.where(depth_gt < 1e-3, depth_gt * 0 + 1e-3, depth_gt)
         | 
| 349 | 
            +
                                for i in range(num_log_images):
         | 
| 350 | 
            +
                                    if args.dataset == 'nyu':
         | 
| 351 | 
            +
                                        writer.add_image('depth_gt/image/{}'.format(i), colormap(depth_gt[i, :, :, :].data), global_step)
         | 
| 352 | 
            +
                                        writer.add_image('image/image/{}'.format(i), inv_normalize(image[i, :, :, :]).data, global_step)                            
         | 
| 353 | 
            +
                                        writer.add_image('depth_r_est0/image/{}'.format(i), colormap(pred_depths_r_list[0][i, :, :, :].data), global_step)
         | 
| 354 | 
            +
                                        writer.add_image('depth_r_est1/image/{}'.format(i), colormap(pred_depths_r_list[1][i, :, :, :].data), global_step)
         | 
| 355 | 
            +
                                        writer.add_image('depth_r_est2/image/{}'.format(i), colormap(pred_depths_r_list[2][i, :, :, :].data), global_step)
         | 
| 356 | 
            +
                                        writer.add_image('depth_r_est3/image/{}'.format(i), colormap(pred_depths_r_list[3][i, :, :, :].data), global_step)
         | 
| 357 | 
            +
                                        writer.add_image('depth_r_est4/image/{}'.format(i), colormap(pred_depths_r_list[4][i, :, :, :].data), global_step)
         | 
| 358 | 
            +
                                        writer.add_image('depth_r_est5/image/{}'.format(i), colormap(pred_depths_r_list[5][i, :, :, :].data), global_step)
         | 
| 359 | 
            +
                                        writer.add_image('depth_c_est0/image/{}'.format(i), colormap(pred_depths_c_list[0][i, :, :, :].data), global_step)
         | 
| 360 | 
            +
                                        writer.add_image('depth_c_est1/image/{}'.format(i), colormap(pred_depths_c_list[1][i, :, :, :].data), global_step)
         | 
| 361 | 
            +
                                        writer.add_image('depth_c_est2/image/{}'.format(i), colormap(pred_depths_c_list[2][i, :, :, :].data), global_step)
         | 
| 362 | 
            +
                                        writer.add_image('depth_c_est3/image/{}'.format(i), colormap(pred_depths_c_list[3][i, :, :, :].data), global_step)
         | 
| 363 | 
            +
                                        writer.add_image('depth_c_est4/image/{}'.format(i), colormap(pred_depths_c_list[4][i, :, :, :].data), global_step)
         | 
| 364 | 
            +
                                        writer.add_image('depth_c_est5/image/{}'.format(i), colormap(pred_depths_c_list[5][i, :, :, :].data), global_step)
         | 
| 365 | 
            +
                                    else:
         | 
| 366 | 
            +
                                        writer.add_image('depth_gt/image/{}'.format(i), colormap_magma(torch.log10(depth_gt[i, :, :, :].data)), global_step)
         | 
| 367 | 
            +
                                        writer.add_image('image/image/{}'.format(i), inv_normalize(image[i, :, :, :]).data, global_step)                            
         | 
| 368 | 
            +
                                        writer.add_image('depth_r_est0/image/{}'.format(i), colormap_magma(torch.log10(pred_depths_r_list[0][i, :, :, :].data)), global_step)
         | 
| 369 | 
            +
                                        writer.add_image('depth_r_est1/image/{}'.format(i), colormap_magma(torch.log10(pred_depths_r_list[1][i, :, :, :].data)), global_step)
         | 
| 370 | 
            +
                                        writer.add_image('depth_r_est2/image/{}'.format(i), colormap_magma(torch.log10(pred_depths_r_list[2][i, :, :, :].data)), global_step)
         | 
| 371 | 
            +
                                        writer.add_image('depth_r_est3/image/{}'.format(i), colormap_magma(torch.log10(pred_depths_r_list[3][i, :, :, :].data)), global_step)
         | 
| 372 | 
            +
                                        writer.add_image('depth_r_est4/image/{}'.format(i), colormap_magma(torch.log10(pred_depths_r_list[4][i, :, :, :].data)), global_step)
         | 
| 373 | 
            +
                                        writer.add_image('depth_r_est5/image/{}'.format(i), colormap_magma(torch.log10(pred_depths_r_list[5][i, :, :, :].data)), global_step)
         | 
| 374 | 
            +
                                        writer.add_image('depth_c_est0/image/{}'.format(i), colormap_magma(torch.log10(pred_depths_c_list[0][i, :, :, :].data)), global_step)
         | 
| 375 | 
            +
                                        writer.add_image('depth_c_est1/image/{}'.format(i), colormap_magma(torch.log10(pred_depths_c_list[1][i, :, :, :].data)), global_step)
         | 
| 376 | 
            +
                                        writer.add_image('depth_c_est2/image/{}'.format(i), colormap_magma(torch.log10(pred_depths_c_list[2][i, :, :, :].data)), global_step)
         | 
| 377 | 
            +
                                        writer.add_image('depth_c_est3/image/{}'.format(i), colormap_magma(torch.log10(pred_depths_c_list[3][i, :, :, :].data)), global_step)
         | 
| 378 | 
            +
                                        writer.add_image('depth_c_est4/image/{}'.format(i), colormap_magma(torch.log10(pred_depths_c_list[4][i, :, :, :].data)), global_step)
         | 
| 379 | 
            +
                                        writer.add_image('depth_c_est5/image/{}'.format(i), colormap_magma(torch.log10(pred_depths_c_list[5][i, :, :, :].data)), global_step)
         | 
| 380 | 
            +
             | 
| 381 | 
            +
                                    writer.add_image('uncer_est0/image/{}'.format(i), colormap(uncertainty_maps_list[0][i, :, :, :].data), global_step)
         | 
| 382 | 
            +
                                    writer.add_image('uncer_est1/image/{}'.format(i), colormap(uncertainty_maps_list[1][i, :, :, :].data), global_step)
         | 
| 383 | 
            +
                                    writer.add_image('uncer_est2/image/{}'.format(i), colormap(uncertainty_maps_list[2][i, :, :, :].data), global_step)
         | 
| 384 | 
            +
                                    writer.add_image('uncer_est3/image/{}'.format(i), colormap(uncertainty_maps_list[3][i, :, :, :].data), global_step)
         | 
| 385 | 
            +
                                    writer.add_image('uncer_est4/image/{}'.format(i), colormap(uncertainty_maps_list[4][i, :, :, :].data), global_step)
         | 
| 386 | 
            +
                                    writer.add_image('uncer_est5/image/{}'.format(i), colormap(uncertainty_maps_list[5][i, :, :, :].data), global_step)
         | 
| 387 | 
            +
                                           
         | 
| 388 | 
            +
                        if args.do_online_eval and global_step and global_step % args.eval_freq == 0 and not model_just_loaded:
         | 
| 389 | 
            +
                            time.sleep(0.1)
         | 
| 390 | 
            +
                            model.eval()
         | 
| 391 | 
            +
                            with torch.no_grad():
         | 
| 392 | 
            +
                                eval_measures = online_eval(model, dataloader_eval, gpu, epoch, ngpus_per_node, group, post_process=True)
         | 
| 393 | 
            +
                            if eval_measures is not None:
         | 
| 394 | 
            +
                                exp_name = '%s'%(datetime.now().strftime('%m%d'))
         | 
| 395 | 
            +
                                log_txt = os.path.join(args.log_directory + '/' + args.model_name, exp_name+'_logs.txt')
         | 
| 396 | 
            +
                                with open(log_txt, 'a') as txtfile:
         | 
| 397 | 
            +
                                    txtfile.write(">>>>>>>>>>>>>>>>>>>>>>>>>Step:%d>>>>>>>>>>>>>>>>>>>>>>>>>\n"%(int(global_step)))
         | 
| 398 | 
            +
                                    txtfile.write("{:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}\n".format('silog', 
         | 
| 399 | 
            +
                                                    'abs_rel', 'log10', 'rms', 'sq_rel', 'log_rms', 'd1', 'd2','d3'))
         | 
| 400 | 
            +
                                    txtfile.write("depth estimation\n")
         | 
| 401 | 
            +
                                    line = ''
         | 
| 402 | 
            +
                                    for i in range(9):
         | 
| 403 | 
            +
                                        line +='{:7.4f}, '.format(eval_measures[i])
         | 
| 404 | 
            +
                                    txtfile.write(line+'\n')
         | 
| 405 | 
            +
             | 
| 406 | 
            +
                                for i in range(9):
         | 
| 407 | 
            +
                                    eval_summary_writer.add_scalar(eval_metrics[i], eval_measures[i].cpu(), int(global_step))
         | 
| 408 | 
            +
                                    measure = eval_measures[i]
         | 
| 409 | 
            +
                                    is_best = False
         | 
| 410 | 
            +
                                    if i < 6 and measure < best_eval_measures_lower_better[i]:
         | 
| 411 | 
            +
                                        old_best = best_eval_measures_lower_better[i].item()
         | 
| 412 | 
            +
                                        best_eval_measures_lower_better[i] = measure.item()
         | 
| 413 | 
            +
                                        is_best = True
         | 
| 414 | 
            +
                                    elif i >= 6 and measure > best_eval_measures_higher_better[i-6]:
         | 
| 415 | 
            +
                                        old_best = best_eval_measures_higher_better[i-6].item()
         | 
| 416 | 
            +
                                        best_eval_measures_higher_better[i-6] = measure.item()
         | 
| 417 | 
            +
                                        is_best = True
         | 
| 418 | 
            +
                                    if is_best:
         | 
| 419 | 
            +
                                        old_best_step = best_eval_steps[i]
         | 
| 420 | 
            +
                                        old_best_name = '/model-{}-best_{}_{:.5f}'.format(old_best_step, eval_metrics[i], old_best)
         | 
| 421 | 
            +
                                        model_path = args.log_directory + '/' + args.model_name + old_best_name
         | 
| 422 | 
            +
                                        if os.path.exists(model_path):
         | 
| 423 | 
            +
                                            command = 'rm {}'.format(model_path)
         | 
| 424 | 
            +
                                            os.system(command)
         | 
| 425 | 
            +
                                        best_eval_steps[i] = global_step
         | 
| 426 | 
            +
                                        model_save_name = '/model-{}-best_{}_{:.5f}'.format(global_step, eval_metrics[i], measure)
         | 
| 427 | 
            +
                                        print('New best for {}. Saving model: {}'.format(eval_metrics[i], model_save_name))
         | 
| 428 | 
            +
                                        checkpoint = {'global_step': global_step,
         | 
| 429 | 
            +
                                                      'model': model.state_dict(),
         | 
| 430 | 
            +
                                                      'optimizer': optimizer.state_dict(),
         | 
| 431 | 
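online_eval above relies on flip_lr and post_process_depth, which live in iebins/utils.py beyond the excerpt shown here. A minimal sketch of the flip-augmentation idea they implement, under the assumption that the prediction and the re-flipped prediction of the mirrored input are simply averaged (the repository's own helper may additionally blend image borders):

    import torch

    def flip_lr_sketch(image):
        # image: (B, C, H, W); mirror along the width axis
        return torch.flip(image, dims=[3])

    def post_process_depth_sketch(depth, depth_from_flipped):
        # undo the mirroring on the second estimate, then average the two
        return 0.5 * (depth + torch.flip(depth_from_flipped, dims=[3]))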
            +
                                                      'best_eval_measures_higher_better': best_eval_measures_higher_better,
         | 
| 432 | 
            +
                                                      'best_eval_measures_lower_better': best_eval_measures_lower_better,
         | 
| 433 | 
            +
                                                      'best_eval_steps': best_eval_steps
         | 
| 434 | 
            +
                                                      }
         | 
| 435 | 
            +
                                        torch.save(checkpoint, args.log_directory + '/' + args.model_name + model_save_name)
         | 
| 436 | 
            +
                                eval_summary_writer.flush()
         | 
| 437 | 
            +
                            model.train()
         | 
| 438 | 
            +
                            block_print()
         | 
| 439 | 
            +
                            enable_print()
         | 
| 440 | 
            +
             | 
| 441 | 
            +
                        model_just_loaded = False
         | 
| 442 | 
            +
                        global_step += 1
         | 
| 443 | 
            +
             | 
| 444 | 
            +
                    epoch += 1
         | 
| 445 | 
            +
                   
         | 
| 446 | 
            +
                if not args.multiprocessing_distributed or (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
         | 
| 447 | 
            +
                    writer.close()
         | 
| 448 | 
            +
                    if args.do_online_eval:
         | 
| 449 | 
            +
                        eval_summary_writer.close()
         | 
| 450 | 
            +
             | 
| 451 | 
            +
             | 
| 452 | 
            +
            def main():
         | 
| 453 | 
            +
                if args.mode != 'train':
         | 
| 454 | 
            +
                    print('train.py is only for training.')
         | 
| 455 | 
            +
                    return -1
         | 
| 456 | 
            +
             | 
| 457 | 
            +
                exp_name = '%s'%(datetime.now().strftime('%m%d'))  
         | 
| 458 | 
            +
                args.log_directory = os.path.join(args.log_directory,exp_name)  
         | 
| 459 | 
            +
    command = 'mkdir -p ' + os.path.join(args.log_directory, args.model_name)  # -p: the date-stamped log_directory may not exist yet
         | 
| 460 | 
            +
                os.system(command)
         | 
| 461 | 
            +
             | 
| 462 | 
            +
                args_out_path = os.path.join(args.log_directory, args.model_name)
         | 
| 463 | 
            +
                command = 'cp ' + sys.argv[1] + ' ' + args_out_path
         | 
| 464 | 
            +
                os.system(command)
         | 
| 465 | 
            +
             | 
| 466 | 
            +
                save_files = True
         | 
| 467 | 
            +
                if save_files:
         | 
| 468 | 
            +
                    aux_out_path = os.path.join(args.log_directory, args.model_name)
         | 
| 469 | 
            +
                    networks_savepath = os.path.join(aux_out_path, 'networks')
         | 
| 470 | 
            +
                    dataloaders_savepath = os.path.join(aux_out_path, 'dataloaders')
         | 
| 471 | 
            +
                    command = 'cp iebins/train.py ' + aux_out_path
         | 
| 472 | 
            +
                    os.system(command)
         | 
| 473 | 
            +
                    command = 'mkdir -p ' + networks_savepath + ' && cp iebins/networks/*.py ' + networks_savepath
         | 
| 474 | 
            +
                    os.system(command)
         | 
| 475 | 
            +
                    command = 'mkdir -p ' + dataloaders_savepath + ' && cp iebins/dataloaders/*.py ' + dataloaders_savepath
         | 
| 476 | 
            +
                    os.system(command)
         | 
| 477 | 
            +
             | 
| 478 | 
            +
                torch.cuda.empty_cache()
         | 
| 479 | 
            +
                args.distributed = args.world_size > 1 or args.multiprocessing_distributed
         | 
| 480 | 
            +
             | 
| 481 | 
            +
                ngpus_per_node = torch.cuda.device_count()
         | 
| 482 | 
            +
                if ngpus_per_node > 1 and not args.multiprocessing_distributed:
         | 
| 483 | 
            +
                    print("This machine has more than 1 gpu. Please specify --multiprocessing_distributed, or set \'CUDA_VISIBLE_DEVICES=0\'")
         | 
| 484 | 
            +
                    return -1
         | 
| 485 | 
            +
             | 
| 486 | 
            +
                if args.do_online_eval:
         | 
| 487 | 
            +
                    print("You have specified --do_online_eval.")
         | 
| 488 | 
            +
                    print("This will evaluate the model every eval_freq {} steps and save best models for individual eval metrics."
         | 
| 489 | 
            +
                          .format(args.eval_freq))
         | 
| 490 | 
            +
             | 
| 491 | 
            +
                if args.multiprocessing_distributed:
         | 
| 492 | 
            +
                    args.world_size = ngpus_per_node * args.world_size
         | 
| 493 | 
            +
                    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
         | 
| 494 | 
            +
                else:
         | 
| 495 | 
            +
                    main_worker(args.gpu, ngpus_per_node, args)
         | 
| 496 | 
            +
             | 
| 497 | 
            +
             | 
| 498 | 
            +
            if __name__ == '__main__':
         | 
| 499 | 
            +
                main()
         | 
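For a single node with, say, four visible GPUs and the defaults above (world_size=1, rank=0), the multiprocessing path sets world_size = 4 * 1 = 4, and each spawned main_worker computes its rank as 0 * 4 + gpu, i.e. ranks 0-3, which is what dist.init_process_group over the NCCL backend expects. Without --multiprocessing_distributed the script refuses to run on a multi-GPU machine unless CUDA_VISIBLE_DEVICES restricts it to one device.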
    	
        iebins/utils.py
    ADDED
    
    | @@ -0,0 +1,356 @@ | |
| 1 | 
            +
            import torch
         | 
| 2 | 
            +
            import torch.nn as nn
         | 
| 3 | 
            +
            import torch.nn.functional as F
         | 
| 4 | 
            +
            import torch.distributed as dist
         | 
| 5 | 
            +
            from torch.utils.data import Sampler
         | 
| 6 | 
            +
            from torchvision import transforms
         | 
| 7 | 
            +
            import matplotlib.pyplot as plt
         | 
| 8 | 
            +
            import os, sys
         | 
| 9 | 
            +
            import numpy as np
         | 
| 10 | 
            +
            import math
         | 
| 11 | 
            +
            import matplotlib.cm  # needed by colorize() below; 'import matplotlib.pyplot as plt' alone does not bind the name 'matplotlib'
         | 
| 12 | 
            +
             | 
| 13 | 
            +
             | 
| 14 | 
            +
            def convert_arg_line_to_args(arg_line):
         | 
| 15 | 
            +
                for arg in arg_line.split():
         | 
| 16 | 
            +
                    if not arg.strip():
         | 
| 17 | 
            +
                        continue
         | 
| 18 | 
            +
                    yield arg
         | 
| 19 | 
            +
             | 
| 20 | 
            +
             | 
| 21 | 
            +
            def block_print():
         | 
| 22 | 
            +
                sys.stdout = open(os.devnull, 'w')
         | 
| 23 | 
            +
             | 
| 24 | 
            +
             | 
| 25 | 
            +
            def enable_print():
         | 
| 26 | 
            +
                sys.stdout = sys.__stdout__
         | 
| 27 | 
            +
             | 
| 28 | 
            +
             | 
| 29 | 
            +
            def get_num_lines(file_path):
         | 
| 30 | 
            +
                f = open(file_path, 'r')
         | 
| 31 | 
            +
                lines = f.readlines()
         | 
| 32 | 
            +
                f.close()
         | 
| 33 | 
            +
                return len(lines)
         | 
| 34 | 
            +
             | 
| 35 | 
            +
             | 
| 36 | 
            +
            def colorize(value, vmin=None, vmax=None, cmap='Greys'):
         | 
| 37 | 
            +
                value = value.cpu().numpy()[:, :, :]
         | 
| 38 | 
            +
                value = np.log10(value)
         | 
| 39 | 
            +
             | 
| 40 | 
            +
                vmin = value.min() if vmin is None else vmin
         | 
| 41 | 
            +
                vmax = value.max() if vmax is None else vmax
         | 
| 42 | 
            +
             | 
| 43 | 
            +
                if vmin != vmax:
         | 
| 44 | 
            +
                    value = (value - vmin) / (vmax - vmin)
         | 
| 45 | 
            +
                else:
         | 
| 46 | 
            +
                    value = value*0.
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                cmapper = matplotlib.cm.get_cmap(cmap)
         | 
| 49 | 
            +
                value = cmapper(value, bytes=True)
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                img = value[:, :, :3]
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                return img.transpose((2, 0, 1))
         | 
| 54 | 
            +
             | 
| 55 | 
            +
             | 
| 56 | 
            +
            def normalize_result(value, vmin=None, vmax=None):
         | 
| 57 | 
            +
                value = value.cpu().numpy()[0, :, :]
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                vmin = value.min() if vmin is None else vmin
         | 
| 60 | 
            +
                vmax = value.max() if vmax is None else vmax
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                if vmin != vmax:
         | 
| 63 | 
            +
                    value = (value - vmin) / (vmax - vmin)
         | 
| 64 | 
            +
                else:
         | 
| 65 | 
            +
                    value = value * 0.
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                return np.expand_dims(value, 0)
         | 
| 68 | 
            +
             | 
| 69 | 
            +
             | 
| 70 | 
            +
            inv_normalize = transforms.Normalize(
         | 
| 71 | 
            +
                mean=[-0.485/0.229, -0.456/0.224, -0.406/0.225],
         | 
| 72 | 
            +
                std=[1/0.229, 1/0.224, 1/0.225]
         | 
| 73 | 
            +
            )
         | 
| 74 | 
            +
             | 
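inv_normalize undoes the standard ImageNet normalization Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]): since Normalize maps x to (x - m) / s, composing it with Normalize(-m/s, 1/s) returns the original pixel values, which is what train.py uses when logging input images to TensorBoard.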
| 75 | 
            +
             | 
| 76 | 
            +
            eval_metrics = ['silog', 'abs_rel', 'log10', 'rms', 'sq_rel', 'log_rms', 'd1', 'd2', 'd3']
         | 
| 77 | 
            +
             | 
| 78 | 
            +
             | 
| 79 | 
            +
            def compute_errors(gt, pred):
         | 
| 80 | 
            +
                thresh = np.maximum((gt / pred), (pred / gt))
         | 
| 81 | 
            +
                d1 = (thresh < 1.25).mean()
         | 
| 82 | 
            +
                d2 = (thresh < 1.25 ** 2).mean()
         | 
| 83 | 
            +
                d3 = (thresh < 1.25 ** 3).mean()
         | 
| 84 | 
            +
             | 
| 85 | 
            +
                rms = (gt - pred) ** 2
         | 
| 86 | 
            +
                rms = np.sqrt(rms.mean())
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                log_rms = (np.log(gt) - np.log(pred)) ** 2
         | 
| 89 | 
            +
                log_rms = np.sqrt(log_rms.mean())
         | 
| 90 | 
            +
             | 
| 91 | 
            +
                abs_rel = np.mean(np.abs(gt - pred) / gt)
         | 
| 92 | 
            +
                sq_rel = np.mean(((gt - pred) ** 2) / gt)
         | 
| 93 | 
            +
             | 
| 94 | 
            +
                err = np.log(pred) - np.log(gt)
         | 
| 95 | 
            +
                silog = np.sqrt(np.mean(err ** 2) - np.mean(err) ** 2) * 100
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                err = np.abs(np.log10(pred) - np.log10(gt))
         | 
| 98 | 
            +
                log10 = np.mean(err)
         | 
| 99 | 
            +
             | 
| 100 | 
            +
                return [silog, abs_rel, log10, rms, sq_rel, log_rms, d1, d2, d3]
         | 
| 101 | 
            +
             | 
| 102 | 
            +
             | 
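A quick, purely illustrative sanity check of compute_errors on synthetic values:

    import numpy as np

    gt = np.array([1.0, 2.0, 4.0])
    pred = np.array([1.1, 1.8, 4.4])
    silog, abs_rel, log10, rms, sq_rel, log_rms, d1, d2, d3 = compute_errors(gt, pred)
    # every ratio max(gt/pred, pred/gt) is below 1.25, so d1 == d2 == d3 == 1.0,
    # and abs_rel = mean(|gt - pred| / gt) = mean([0.1, 0.1, 0.1]) = 0.1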
| 103 | 
            +
            class silog_loss(nn.Module):
         | 
| 104 | 
            +
                def __init__(self, variance_focus):
         | 
| 105 | 
            +
                    super(silog_loss, self).__init__()
         | 
| 106 | 
            +
                    self.variance_focus = variance_focus
         | 
| 107 | 
            +
             | 
| 108 | 
            +
                def forward(self, depth_est, depth_gt, mask):
         | 
| 109 | 
            +
                    d = torch.log(depth_est[mask]) - torch.log(depth_gt[mask])
         | 
| 110 | 
            +
                    return torch.sqrt((d ** 2).mean() - self.variance_focus * (d.mean() ** 2)) * 10.0
         | 
| 111 | 
            +
             | 
| 112 | 
            +
             | 
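The class above is the scale-invariant log (SILog) loss: with d_i = log(pred_i) - log(gt_i) over the masked pixels, it returns 10 * sqrt(mean(d^2) - variance_focus * mean(d)^2). With variance_focus = 1 the loss depends only on the variance of d and is therefore invariant to a global scaling of the prediction; the default of 0.85 in train.py keeps some pressure on the absolute scale, as the argument's help text notes.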
| 113 | 
            +
            def entropy_loss(preds, gt_label, mask):
         | 
| 114 | 
            +
                # preds: B, C, H, W
         | 
| 115 | 
            +
                # gt_label: B, H, W
         | 
| 116 | 
            +
                # mask: B, H, W
         | 
| 117 | 
            +
                mask = mask > 0.0 # B, H, W
         | 
| 118 | 
            +
                preds = preds.permute(0, 2, 3, 1) # B, H, W, C
         | 
| 119 | 
            +
                preds_mask = preds[mask] # N, C
         | 
| 120 | 
            +
                gt_label_mask = gt_label[mask] # N
         | 
| 121 | 
            +
                loss = F.cross_entropy(preds_mask, gt_label_mask, reduction='mean')
         | 
| 122 | 
            +
                return loss
         | 
| 123 | 
            +
             | 
| 124 | 
            +
             | 
| 125 | 
            +
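`silog_loss` is the scale-invariant log loss: with `variance_focus` at 1.0 it reduces to the pure SILog term, and at 0.0 to RMSE in log space. A hedged training-step sketch (the 0.85 value, tensor shapes and validity threshold are illustrative assumptions):

```python
import torch

# Hypothetical predicted / ground-truth depth, shape (B, 1, H, W), strictly positive.
depth_est = torch.rand(2, 1, 120, 160, requires_grad=True) * 10 + 0.1
depth_gt = torch.rand(2, 1, 120, 160) * 10 + 0.1

criterion = silog_loss(variance_focus=0.85)  # illustrative value
mask = depth_gt > 0.1                        # supervise only valid pixels
loss = criterion(depth_est, depth_gt, mask)
loss.backward()
```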
def colormap(inputs, normalize=True, torch_transpose=True):
    if isinstance(inputs, torch.Tensor):
        inputs = inputs.detach().cpu().numpy()
    _DEPTH_COLORMAP = plt.get_cmap('jet', 256)  # for plotting
    vis = inputs
    if normalize:
        ma = float(vis.max())
        mi = float(vis.min())
        d = ma - mi if ma != mi else 1e5
        vis = (vis - mi) / d

    if vis.ndim == 4:
        vis = vis.transpose([0, 2, 3, 1])
        vis = _DEPTH_COLORMAP(vis)
        vis = vis[:, :, :, 0, :3]
        if torch_transpose:
            vis = vis.transpose(0, 3, 1, 2)
    elif vis.ndim == 3:
        vis = _DEPTH_COLORMAP(vis)
        vis = vis[:, :, :, :3]
        if torch_transpose:
            vis = vis.transpose(0, 3, 1, 2)
    elif vis.ndim == 2:
        vis = _DEPTH_COLORMAP(vis)
        vis = vis[..., :3]
        if torch_transpose:
            vis = vis.transpose(2, 0, 1)

    return vis[0, :, :, :]


def colormap_magma(inputs, normalize=True, torch_transpose=True):
    if isinstance(inputs, torch.Tensor):
        inputs = inputs.detach().cpu().numpy()
    _DEPTH_COLORMAP = plt.get_cmap('magma', 256)  # for plotting
    vis = inputs
    if normalize:
        ma = float(vis.max())
        mi = float(vis.min())
        d = ma - mi if ma != mi else 1e5
        vis = (vis - mi) / d

    if vis.ndim == 4:
        vis = vis.transpose([0, 2, 3, 1])
        vis = _DEPTH_COLORMAP(vis)
        vis = vis[:, :, :, 0, :3]
        if torch_transpose:
            vis = vis.transpose(0, 3, 1, 2)
    elif vis.ndim == 3:
        vis = _DEPTH_COLORMAP(vis)
        vis = vis[:, :, :, :3]
        if torch_transpose:
            vis = vis.transpose(0, 3, 1, 2)
    elif vis.ndim == 2:
        vis = _DEPTH_COLORMAP(vis)
        vis = vis[..., :3]
        if torch_transpose:
            vis = vis.transpose(2, 0, 1)

    return vis[0, :, :, :]

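`colormap` and `colormap_magma` turn a depth tensor into a CHW RGB visualization in [0, 1]; they rely on `plt` (matplotlib.pyplot) being available in this module. An illustrative conversion to an HWC uint8 image for display:

```python
import numpy as np
import torch

# Hypothetical network output: a single depth map of shape (1, 1, H, W).
depth = torch.rand(1, 1, 240, 320)

vis = colormap_magma(depth)                                   # (3, H, W) float RGB in [0, 1]
vis_uint8 = (vis.transpose(1, 2, 0) * 255).astype(np.uint8)   # (H, W, 3) for saving/display
```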
def flip_lr(image):
    """
    Flip image horizontally

    Parameters
    ----------
    image : torch.Tensor [B,3,H,W]
        Image to be flipped

    Returns
    -------
    image_flipped : torch.Tensor [B,3,H,W]
        Flipped image
    """
    assert image.dim() == 4, 'You need to provide a [B,C,H,W] image to flip'
    return torch.flip(image, [3])


def fuse_inv_depth(inv_depth, inv_depth_hat, method='mean'):
    """
    Fuse inverse depth and flipped inverse depth maps

    Parameters
    ----------
    inv_depth : torch.Tensor [B,1,H,W]
        Inverse depth map
    inv_depth_hat : torch.Tensor [B,1,H,W]
        Flipped inverse depth map produced from a flipped image
    method : str
        Method that will be used to fuse the inverse depth maps

    Returns
    -------
    fused_inv_depth : torch.Tensor [B,1,H,W]
        Fused inverse depth map
    """
    if method == 'mean':
        return 0.5 * (inv_depth + inv_depth_hat)
    elif method == 'max':
        return torch.max(inv_depth, inv_depth_hat)
    elif method == 'min':
        return torch.min(inv_depth, inv_depth_hat)
    else:
        raise ValueError('Unknown post-process method {}'.format(method))


def post_process_depth(depth, depth_flipped, method='mean'):
    """
    Post-process a depth map together with the prediction from a horizontally flipped input

    Parameters
    ----------
    depth : torch.Tensor [B,1,H,W]
        Depth map
    depth_flipped : torch.Tensor [B,1,H,W]
        Depth map produced from a flipped image
    method : str
        Method that will be used to fuse the depth maps

    Returns
    -------
    depth_pp : torch.Tensor [B,1,H,W]
        Post-processed depth map
    """
    B, C, H, W = depth.shape
    inv_depth_hat = flip_lr(depth_flipped)
    inv_depth_fused = fuse_inv_depth(depth, inv_depth_hat, method=method)
    xs = torch.linspace(0., 1., W, device=depth.device,
                        dtype=depth.dtype).repeat(B, C, H, 1)
    mask = 1.0 - torch.clamp(20. * (xs - 0.05), 0., 1.)
    mask_hat = flip_lr(mask)
    return mask_hat * depth + mask * inv_depth_hat + \
           (1.0 - mask - mask_hat) * inv_depth_fused

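`post_process_depth` implements flip-augmented inference: predict once on the image and once on its mirror, un-flip the second prediction, and blend the two with ramp masks near the left and right borders. A sketch of how it could be wired around a depth network (the `model` interface here is an assumption, not something defined in this file):

```python
import torch

def infer_with_flip(model, image):
    """image: (B, 3, H, W); model is assumed to return a (B, 1, H, W) depth map."""
    with torch.no_grad():
        depth = model(image)
        depth_flipped = model(flip_lr(image))
    # Fuse the two predictions; the ramp masks above down-weight border artefacts.
    return post_process_depth(depth, depth_flipped, method='mean')
```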
class DistributedSamplerNoEvenlyDivisible(Sampler):
    """Sampler that restricts data loading to a subset of the dataset.

    It is especially useful in conjunction with
    :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
    process can pass a DistributedSampler instance as a DataLoader sampler,
    and load a subset of the original dataset that is exclusive to it.

    .. note::
        Dataset is assumed to be of constant size.

    Arguments:
        dataset: Dataset used for sampling.
        num_replicas (optional): Number of processes participating in
            distributed training.
        rank (optional): Rank of the current process within num_replicas.
        shuffle (optional): If true (default), sampler will shuffle the indices
    """

    def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            rank = dist.get_rank()
        self.dataset = dataset
        self.num_replicas = num_replicas
        self.rank = rank
        self.epoch = 0
        num_samples = int(math.floor(len(self.dataset) * 1.0 / self.num_replicas))
        rest = len(self.dataset) - num_samples * self.num_replicas
        if self.rank < rest:
            num_samples += 1
        self.num_samples = num_samples
        self.total_size = len(dataset)
        # self.total_size = self.num_samples * self.num_replicas
        self.shuffle = shuffle

    def __iter__(self):
        # deterministically shuffle based on epoch
        g = torch.Generator()
        g.manual_seed(self.epoch)
        if self.shuffle:
            indices = torch.randperm(len(self.dataset), generator=g).tolist()
        else:
            indices = list(range(len(self.dataset)))

        # add extra samples to make it evenly divisible
        # indices += indices[:(self.total_size - len(indices))]
        # assert len(indices) == self.total_size

        # subsample
        indices = indices[self.rank:self.total_size:self.num_replicas]
        self.num_samples = len(indices)
        # assert len(indices) == self.num_samples

        return iter(indices)

    def __len__(self):
        return self.num_samples

    def set_epoch(self, epoch):
        self.epoch = epoch

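Unlike the stock `DistributedSampler`, `DistributedSamplerNoEvenlyDivisible` never pads or duplicates indices to make the per-rank split evenly divisible, which is what you want for evaluation. A small sketch with an explicit rank and world size (in a real run these would come from `torch.distributed`):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.randn(101, 3, 32, 32))  # 101 samples: not divisible by 2
sampler = DistributedSamplerNoEvenlyDivisible(dataset, num_replicas=2, rank=0, shuffle=False)
loader = DataLoader(dataset, batch_size=1, sampler=sampler)

# Rank 0 gets 51 samples, rank 1 gets 50; together they cover the dataset exactly once.
print(len(sampler), len(loader))
```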
class D_to_cloud(nn.Module):
    """Layer to transform depth into point cloud
    """
    def __init__(self, batch_size, height, width):
        super(D_to_cloud, self).__init__()

        self.batch_size = batch_size
        self.height = height
        self.width = width

        meshgrid = np.meshgrid(range(self.width), range(self.height), indexing='xy')
        self.id_coords = np.stack(meshgrid, axis=0).astype(np.float32)  # 2, H, W
        self.id_coords = nn.Parameter(torch.from_numpy(self.id_coords), requires_grad=False)  # 2, H, W

        self.ones = nn.Parameter(torch.ones(self.batch_size, 1, self.height * self.width),
                                 requires_grad=False)  # B, 1, L (L = H * W)

        self.pix_coords = torch.unsqueeze(torch.stack(
            [self.id_coords[0].view(-1), self.id_coords[1].view(-1)], 0), 0)  # 1, 2, L
        self.pix_coords = self.pix_coords.repeat(batch_size, 1, 1)  # B, 2, L
        self.pix_coords = nn.Parameter(torch.cat([self.pix_coords, self.ones], 1), requires_grad=False)  # B, 3, L

    def forward(self, depth, inv_K):
        cam_points = torch.matmul(inv_K[:, :3, :3], self.pix_coords)
        cam_points = depth.view(self.batch_size, 1, -1) * cam_points

        return cam_points.permute(0, 2, 1)
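`D_to_cloud` back-projects a depth map into a camera-space point cloud using the inverse intrinsics `inv_K`. A usage sketch with made-up pinhole intrinsics (the 4x4 `K` and its values are illustrative):

```python
import numpy as np
import torch

H, W = 480, 640
K = np.array([[525.0,   0.0, 320.0, 0.0],
              [  0.0, 525.0, 240.0, 0.0],
              [  0.0,   0.0,   1.0, 0.0],
              [  0.0,   0.0,   0.0, 1.0]], dtype=np.float32)
inv_K = torch.from_numpy(np.linalg.inv(K)).unsqueeze(0)   # (1, 4, 4)

backproject = D_to_cloud(batch_size=1, height=H, width=W)
depth = torch.rand(1, 1, H, W) * 10.0
points = backproject(depth, inv_K)                        # (1, H*W, 3) camera-space points
```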
    	
iebins/utils/transfrom.py
ADDED
import random
from PIL import Image, ImageOps, ImageFilter
import torch
from torchvision import transforms
import torch.nn.functional as F

import numpy as np
import cv2
import math


def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
    """Resize the sample to ensure the given size. Keeps aspect ratio.
    Args:
        sample (dict): sample
        size (tuple): image size
    Returns:
        tuple: new size
    """
    shape = list(sample["disparity"].shape)

    if shape[0] >= size[0] and shape[1] >= size[1]:
        return sample

    scale = [0, 0]
    scale[0] = size[0] / shape[0]
    scale[1] = size[1] / shape[1]

    scale = max(scale)

    shape[0] = math.ceil(scale * shape[0])
    shape[1] = math.ceil(scale * shape[1])

    # resize
    sample["image"] = cv2.resize(
        sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
    )

    sample["disparity"] = cv2.resize(
        sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
    )
    sample["mask"] = cv2.resize(
        sample["mask"].astype(np.float32),
        tuple(shape[::-1]),
        interpolation=cv2.INTER_NEAREST,
    )
    sample["mask"] = sample["mask"].astype(bool)

    return tuple(shape)


class Resize(object):
    """Resize sample to given size (width, height).
    """

    def __init__(
        self,
        width,
        height,
        resize_target=True,
        keep_aspect_ratio=False,
        ensure_multiple_of=1,
        resize_method="lower_bound",
        image_interpolation_method=cv2.INTER_AREA,
    ):
        """Init.
        Args:
            width (int): desired output width
            height (int): desired output height
            resize_target (bool, optional):
                True: Resize the full sample (image, mask, target).
                False: Resize image only.
                Defaults to True.
            keep_aspect_ratio (bool, optional):
                True: Keep the aspect ratio of the input sample.
                Output sample might not have the given width and height, and
                resize behaviour depends on the parameter 'resize_method'.
                Defaults to False.
            ensure_multiple_of (int, optional):
                Output width and height is constrained to be multiple of this parameter.
                Defaults to 1.
            resize_method (str, optional):
                "lower_bound": Output will be at least as large as the given size.
                "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
                "minimal": Scale as least as possible. (Output size might be smaller than given size.)
                Defaults to "lower_bound".
        """
        self.__width = width
        self.__height = height

        self.__resize_target = resize_target
        self.__keep_aspect_ratio = keep_aspect_ratio
        self.__multiple_of = ensure_multiple_of
        self.__resize_method = resize_method
        self.__image_interpolation_method = image_interpolation_method

    def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
        y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)

        if max_val is not None and y > max_val:
            y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)

        if y < min_val:
            y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)

        return y

    def get_size(self, width, height):
        # determine new height and width
        scale_height = self.__height / height
        scale_width = self.__width / width

        if self.__keep_aspect_ratio:
            if self.__resize_method == "lower_bound":
                # scale such that output size is lower bound
                if scale_width > scale_height:
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            elif self.__resize_method == "upper_bound":
                # scale such that output size is upper bound
                if scale_width < scale_height:
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            elif self.__resize_method == "minimal":
                # scale as little as possible
                if abs(1 - scale_width) < abs(1 - scale_height):
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            else:
                raise ValueError(
                    f"resize_method {self.__resize_method} not implemented"
                )

        if self.__resize_method == "lower_bound":
            new_height = self.constrain_to_multiple_of(scale_height * height, min_val=self.__height)
            new_width = self.constrain_to_multiple_of(scale_width * width, min_val=self.__width)
        elif self.__resize_method == "upper_bound":
            new_height = self.constrain_to_multiple_of(scale_height * height, max_val=self.__height)
            new_width = self.constrain_to_multiple_of(scale_width * width, max_val=self.__width)
        elif self.__resize_method == "minimal":
            new_height = self.constrain_to_multiple_of(scale_height * height)
            new_width = self.constrain_to_multiple_of(scale_width * width)
        else:
            raise ValueError(f"resize_method {self.__resize_method} not implemented")

        return (new_width, new_height)

    def __call__(self, sample):
        width, height = self.get_size(
            sample["image"].shape[1], sample["image"].shape[0]
        )

        # resize sample
        sample["image"] = cv2.resize(
            sample["image"],
            (width, height),
            interpolation=self.__image_interpolation_method,
        )

        if self.__resize_target:
            if "disparity" in sample:
                sample["disparity"] = cv2.resize(
                    sample["disparity"],
                    (width, height),
                    interpolation=cv2.INTER_NEAREST,
                )

            if "depth" in sample:
                sample["depth"] = cv2.resize(
                    sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
                )

            if "semseg_mask" in sample:
                # sample["semseg_mask"] = cv2.resize(
                #     sample["semseg_mask"], (width, height), interpolation=cv2.INTER_NEAREST
                # )
                sample["semseg_mask"] = F.interpolate(
                    torch.from_numpy(sample["semseg_mask"]).float()[None, None, ...],
                    (height, width), mode='nearest').numpy()[0, 0]

            if "mask" in sample:
                sample["mask"] = cv2.resize(
                    sample["mask"].astype(np.float32),
                    (width, height),
                    interpolation=cv2.INTER_NEAREST,
                )
                # sample["mask"] = sample["mask"].astype(bool)

        # print(sample['image'].shape, sample['depth'].shape)
        return sample


class NormalizeImage(object):
    """Normalize image by given mean and std.
    """

    def __init__(self, mean, std):
        self.__mean = mean
        self.__std = std

    def __call__(self, sample):
        sample["image"] = (sample["image"] - self.__mean) / self.__std

        return sample


class PrepareForNet(object):
    """Prepare sample for usage as network input.
    """

    def __init__(self):
        pass

    def __call__(self, sample):
        image = np.transpose(sample["image"], (2, 0, 1))
        sample["image"] = np.ascontiguousarray(image).astype(np.float32)

        if "mask" in sample:
            sample["mask"] = sample["mask"].astype(np.float32)
            sample["mask"] = np.ascontiguousarray(sample["mask"])

        if "depth" in sample:
            depth = sample["depth"].astype(np.float32)
            sample["depth"] = np.ascontiguousarray(depth)

        if "semseg_mask" in sample:
            sample["semseg_mask"] = sample["semseg_mask"].astype(np.float32)
            sample["semseg_mask"] = np.ascontiguousarray(sample["semseg_mask"])

        return sample
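Taken together, `Resize`, `NormalizeImage` and `PrepareForNet` form a dict-based preprocessing pipeline that can be chained with `torchvision.transforms.Compose`. A hedged sketch of inference preprocessing (the target size, the multiple-of-32 constraint, the ImageNet statistics and the image path are illustrative choices, not read from this file):

```python
import cv2
import numpy as np
import torch
from torchvision.transforms import Compose

transform = Compose([
    Resize(width=640, height=480, resize_target=False, keep_aspect_ratio=True,
           ensure_multiple_of=32, resize_method='lower_bound',
           image_interpolation_method=cv2.INTER_CUBIC),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

# 'example.jpg' is a placeholder path; the image is converted to HWC float RGB in [0, 1].
raw = cv2.cvtColor(cv2.imread('example.jpg'), cv2.COLOR_BGR2RGB) / 255.0
sample = transform({'image': raw})
image = torch.from_numpy(sample['image']).unsqueeze(0)  # (1, 3, H', W') network input
```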
    	
requirements.txt
ADDED
pytorch=1.10.0
torchvision
cudatoolkit=11.1
matplotlib
tqdm
tensorboardX
timm
mmcv
open3d
gradio_imageslider
torch
opencv-python
