|
| 1 | +#include "decode_heic.h" |
| 2 | + |
| 3 | +#if HEIC_FOUND |
| 4 | +#include "libheif/heif_cxx.h" |
| 5 | +#endif // HEIC_FOUND |
| 6 | + |
| 7 | +namespace vision { |
| 8 | +namespace image { |
| 9 | + |
| 10 | +#if !HEIC_FOUND |
| 11 | +torch::Tensor decode_heic( |
| 12 | + const torch::Tensor& encoded_data, |
| 13 | + ImageReadMode mode) { |
| 14 | + TORCH_CHECK( |
| 15 | + false, "decode_heic: torchvision not compiled with libheif support"); |
| 16 | +} |
| 17 | +#else |
| 18 | + |
| 19 | +torch::Tensor decode_heic( |
| 20 | + const torch::Tensor& encoded_data, |
| 21 | + ImageReadMode mode) { |
| 22 | + TORCH_CHECK(encoded_data.is_contiguous(), "Input tensor must be contiguous."); |
| 23 | + TORCH_CHECK( |
| 24 | + encoded_data.dtype() == torch::kU8, |
| 25 | + "Input tensor must have uint8 data type, got ", |
| 26 | + encoded_data.dtype()); |
| 27 | + TORCH_CHECK( |
| 28 | + encoded_data.dim() == 1, |
| 29 | + "Input tensor must be 1-dimensional, got ", |
| 30 | + encoded_data.dim(), |
| 31 | + " dims."); |
| 32 | + |
| 33 | + if (mode != IMAGE_READ_MODE_UNCHANGED && mode != IMAGE_READ_MODE_RGB && |
| 34 | + mode != IMAGE_READ_MODE_RGB_ALPHA) { |
| 35 | + // Other modes aren't supported, but we don't error or even warn because we |
| 36 | + // have generic entry points like decode_image which may support all modes, |
| 37 | + // it just depends on the underlying decoder. |
| 38 | + mode = IMAGE_READ_MODE_UNCHANGED; |
| 39 | + } |
| 40 | + |
| 41 | + // If return_rgb is false it means we return rgba - nothing else. |
| 42 | + auto return_rgb = true; |
| 43 | + |
| 44 | + int height = 0; |
| 45 | + int width = 0; |
| 46 | + int num_channels = 0; |
| 47 | + int stride = 0; |
| 48 | + uint8_t* decoded_data = nullptr; |
| 49 | + heif::Image img; |
| 50 | + int bit_depth = 0; |
| 51 | + |
| 52 | + try { |
| 53 | + heif::Context ctx; |
| 54 | + ctx.read_from_memory_without_copy( |
| 55 | + encoded_data.data_ptr<uint8_t>(), encoded_data.numel()); |
| 56 | + |
| 57 | + // TODO properly error on (or support) image sequences. Right now, I think |
| 58 | + // this function will always return the first image in a sequence, which is |
| 59 | + // inconsistent with decode_gif (which returns a batch) and with decode_avif |
| 60 | + // (which errors loudly). |
| 61 | + // Why? I'm struggling to make sense of |
| 62 | + // ctx.get_number_of_top_level_images(). It disagrees with libavif's |
| 63 | + // imageCount. For example on some of the libavif test images: |
| 64 | + // |
| 65 | + // - colors-animated-12bpc-keyframes-0-2-3.avif |
| 66 | + // avif num images = 5 |
| 67 | + // heif num images = 1 // Why is this 1 when clearly this is supposed to |
| 68 | + // be a sequence? |
| 69 | + // - sofa_grid1x5_420.avif |
| 70 | + // avif num images = 1 |
| 71 | + // heif num images = 6 // If we were to error here we won't be able to |
| 72 | + // decode this image which is otherwise properly |
| 73 | + // decoded by libavif. |
| 74 | + // I can't find a libheif function that does what we need here, or at least |
| 75 | + // that agrees with libavif. |
| 76 | + |
| 77 | + // TORCH_CHECK( |
| 78 | + // ctx.get_number_of_top_level_images() == 1, |
| 79 | + // "heic file contains more than one image"); |
| 80 | + |
| 81 | + heif::ImageHandle handle = ctx.get_primary_image_handle(); |
| 82 | + bit_depth = handle.get_luma_bits_per_pixel(); |
| 83 | + |
| 84 | + return_rgb = |
| 85 | + (mode == IMAGE_READ_MODE_RGB || |
| 86 | + (mode == IMAGE_READ_MODE_UNCHANGED && !handle.has_alpha_channel())); |
| 87 | + |
| 88 | + height = handle.get_height(); |
| 89 | + width = handle.get_width(); |
| 90 | + |
| 91 | + num_channels = return_rgb ? 3 : 4; |
| 92 | + heif_chroma chroma; |
| 93 | + if (bit_depth == 8) { |
| 94 | + chroma = return_rgb ? heif_chroma_interleaved_RGB |
| 95 | + : heif_chroma_interleaved_RGBA; |
| 96 | + } else { |
| 97 | + // TODO: This, along with our 10bits -> 16bits range mapping down below, |
| 98 | + // may not work on BE platforms |
| 99 | + chroma = return_rgb ? heif_chroma_interleaved_RRGGBB_LE |
| 100 | + : heif_chroma_interleaved_RRGGBBAA_LE; |
| 101 | + } |
| 102 | + |
| 103 | + img = handle.decode_image(heif_colorspace_RGB, chroma); |
| 104 | + |
| 105 | + decoded_data = img.get_plane(heif_channel_interleaved, &stride); |
| 106 | + } catch (const heif::Error& err) { |
| 107 | + // We need this try/catch block and call TORCH_CHECK, because libheif may |
| 108 | + // otherwise throw heif::Error that would just be reported as "An unknown |
| 109 | + // exception occurred" when we move back to Python. |
| 110 | + TORCH_CHECK(false, "decode_heif failed: ", err.get_message()); |
| 111 | + } |
| 112 | + TORCH_CHECK(decoded_data != nullptr, "Something went wrong during decoding."); |
| 113 | + |
| 114 | + auto dtype = (bit_depth == 8) ? torch::kUInt8 : at::kUInt16; |
| 115 | + auto out = torch::empty({height, width, num_channels}, dtype); |
| 116 | + uint8_t* out_ptr = (uint8_t*)out.data_ptr(); |
| 117 | + |
| 118 | + // decoded_data is *almost* the raw decoded data, but not quite: for some |
| 119 | + // images, there may be some padding at the end of each row, i.e. when stride |
| 120 | + // != row_size_in_bytes. So we can't copy decoded_data into the tensor's |
| 121 | + // memory directly, we have to copy row by row. Oh, and if you think you can |
| 122 | + // take a shortcut when stride == row_size_in_bytes and just do: |
| 123 | + // out = torch::from_blob(decoded_data, ...) |
| 124 | + // you can't, because decoded_data is owned by the heif::Image object and it |
| 125 | + // gets freed when it gets out of scope! |
| 126 | + auto row_size_in_bytes = width * num_channels * ((bit_depth == 8) ? 1 : 2); |
| 127 | + for (auto h = 0; h < height; h++) { |
| 128 | + memcpy( |
| 129 | + out_ptr + h * row_size_in_bytes, |
| 130 | + decoded_data + h * stride, |
| 131 | + row_size_in_bytes); |
| 132 | + } |
| 133 | + if (bit_depth > 8) { |
| 134 | + // Say bit depth is 10. decodec_data and out_ptr contain 10bits values |
| 135 | + // over 2 bytes, stored into uint16_t. In torchvision a uint16 value is |
| 136 | + // expected to be in [0, 2**16), so we have to map the 10bits value to that |
| 137 | + // range. Note that other libraries like libavif do that mapping |
| 138 | + // automatically. |
| 139 | + // TODO: It's possible to avoid the memcpy call above in this case, and do |
| 140 | + // the copy at the same time as the conversation. Whether it's worth it |
| 141 | + // should be benchmarked. |
| 142 | + auto out_ptr_16 = (uint16_t*)out_ptr; |
| 143 | + for (auto p = 0; p < height * width * num_channels; p++) { |
| 144 | + out_ptr_16[p] <<= (16 - bit_depth); |
| 145 | + } |
| 146 | + } |
| 147 | + return out.permute({2, 0, 1}); |
| 148 | +} |
| 149 | +#endif // HEIC_FOUND |
| 150 | + |
| 151 | +} // namespace image |
| 152 | +} // namespace vision |
0 commit comments