class BaseTransform ( keys=None ) [source]

Base class of all transforms used in computer vision.

calling logic:

if keys is None:

_get_params -> _apply_image()


else:

_get_params -> _apply_*() for * in keys

If you want to implement a custom transform for a given input type, override the corresponding _apply_* method in a subclass.


keys (list[str]|tuple[str], optional) –

Input type. The input is a tuple containing different structures, and keys specifies the type of each element. For example, if your input is a single image, then keys can be None or (“image”). If your input is of the form (image, image), then keys should be (“image”, “image”). If your input is of the form (image, boxes), then keys should be (“image”, “boxes”).

The currently available strings and their data types are described below:

  • “image”: input image, with shape of (H, W, C)

  • “coords”: coordinates, with shape of (N, 2)

  • “boxes”: bounding boxes, with shape of (N, 4), “xyxy” format,

    the 1st “xy” represents the top-left point of a box, the 2nd “xy” represents the bottom-right point.

  • “mask”: map used for segmentation, with shape of (H, W, 1)

You can also define your own data types, provided that you implement the corresponding _apply_*() methods; otherwise NotImplementedError will be raised.


import numpy as np
from PIL import Image
import paddle.vision.transforms.functional as F
from paddle.vision.transforms import BaseTransform

def _get_image_size(img):
    """Return the size of ``img`` as ``(width, height)``.

    Args:
        img: A PIL image or a numpy image, as recognised by
            ``F._is_pil_image`` and ``F._is_numpy_image``.

    Returns:
        ``img.size`` for a PIL image, or the first two entries of
        ``img.shape`` in reversed order for a numpy image.

    Raises:
        TypeError: If ``img`` is neither a PIL image nor a numpy image.
    """
    if F._is_pil_image(img):
        return img.size
    if F._is_numpy_image(img):
        # Reverse the leading (H, W) of the shape so the order matches the
        # PIL branch above.
        return img.shape[:2][::-1]
    # Previously this raise sat after the return inside the numpy branch and
    # could never execute, so unsupported inputs silently produced None.
    raise TypeError("Unexpected type {}".format(type(img)))

class CustomRandomFlip(BaseTransform):
    """Example transform that horizontally flips its inputs with a given probability.

    The flip decision is drawn once per call in ``_get_params`` and then
    read by every ``_apply_*`` method through ``self.params``, so the image,
    coordinates, boxes and mask of one sample are flipped consistently.

    NOTE(review): this relies on the base class storing the result of
    ``_get_params`` in ``self.params`` and exposing the key sequence as
    ``self.keys`` -- confirm against ``BaseTransform``.

    Args:
        prob (float): Probability of applying the flip. Defaults to 0.5.
        keys (list[str]|tuple[str], optional): Input types, forwarded
            unchanged to ``BaseTransform``.
    """

    def __init__(self, prob=0.5, keys=None):
        super().__init__(keys)
        self.prob = prob

    def _get_params(self, inputs):
        """Draw the per-call parameters shared by all ``_apply_*`` methods.

        Args:
            inputs: Sequence of inputs ordered like ``self.keys``; it must
                contain an entry for the ``'image'`` key.

        Returns:
            dict: ``'flip'`` (bool, whether to flip on this call) and
            ``'size'`` (result of ``_get_image_size`` on the image).
        """
        image = inputs[self.keys.index('image')]
        return {
            'flip': np.random.random() < self.prob,
            'size': _get_image_size(image),
        }

    def _apply_image(self, image):
        """Return ``image`` flipped horizontally if a flip was drawn, else unchanged."""
        if self.params['flip']:
            return F.hflip(image)
        return image

    # Only needed when "coords" (or "boxes", which reuses it) is among the keys.
    def _apply_coords(self, coords):
        """Mirror the x column of ``coords`` about the image width.

        Args:
            coords: Array-like of shape (N, 2) holding (x, y) pairs.

        Returns:
            A new array with ``x`` replaced by ``w - x`` when a flip was
            drawn; otherwise ``coords`` itself, untouched.
        """
        if not self.params['flip']:
            return coords
        w = self.params['size'][0]
        # Work on a copy so the caller's array is not modified in place.
        flipped = np.array(coords)
        flipped[:, 0] = w - flipped[:, 0]
        return flipped

    # Only needed when "boxes" is among the keys.
    def _apply_boxes(self, boxes):
        """Transform "xyxy" boxes by transforming their four corners.

        Args:
            boxes: Array-like reshapeable to (N, 4) in "xyxy" format.

        Returns:
            numpy.ndarray: Array of shape (N, 4) in "xyxy" format, rebuilt
            from the min and max of each box's transformed corners.
        """
        # Column indices that expand (x1, y1, x2, y2) into its four corners:
        # (x1, y1), (x2, y1), (x1, y2), (x2, y2).
        idxs = np.array([(0, 1), (2, 1), (0, 3), (2, 3)]).flatten()
        corners = np.asarray(boxes).reshape(-1, 4)[:, idxs].reshape(-1, 2)
        corners = self._apply_coords(corners).reshape((-1, 4, 2))
        # Flipping swaps which corner is leftmost, so re-derive the top-left
        # and bottom-right points from the extremes.
        minxy = corners.min(axis=1)
        maxxy = corners.max(axis=1)
        return np.concatenate((minxy, maxxy), axis=1)

    # Only needed when "mask" is among the keys.
    def _apply_mask(self, mask):
        """Return ``mask`` flipped horizontally if a flip was drawn, else unchanged."""
        if self.params['flip']:
            return F.hflip(mask)
        return mask

# Build synthetic inputs: a random 400x500 RGB image, two "xyxy" boxes,
# and a single-channel mask derived from the image.
random_pixels = np.random.rand(400, 500, 3) * 255.
fake_img = Image.fromarray(random_pixels.astype('uint8'))
fake_boxes = np.array([[2, 3, 200, 300], [50, 60, 80, 100]])
fake_mask = fake_img.convert('L')

# Image-only usage: keys is left at its default and prob=1.0 always flips.
flip_transform = CustomRandomFlip(1.0)
converted_img = flip_transform(fake_img)

# Joint usage: image, boxes and mask are passed together as one tuple,
# in the same order as the keys.
flip_transform = CustomRandomFlip(1.0, keys=('image', 'boxes', 'mask'))
outputs = flip_transform((fake_img, fake_boxes, fake_mask))
converted_img, converted_boxes, converted_mask = outputs
print('converted boxes', converted_boxes)