Initial Extractor commit

This commit is contained in:
Stephan Zeisberg 2021-02-22 18:43:55 +01:00
parent 4d03de479b
commit 9aa96dcbb4
13 changed files with 3251 additions and 2 deletions

9
.gitmodules vendored Normal file

@@ -0,0 +1,9 @@
[submodule "nokia-dumper"]
path = nokia-dumper
url = https://github.com/stze/Home-Brew_Tool-Nokia.git
[submodule "splituapp"]
path = splituapp
url = https://github.com/stze/splituapp.git
[submodule "sinextract"]
path = sinextract
url = https://github.com/stze/anyxperia_dumper.git

11
Dockerfile Normal file

@@ -0,0 +1,11 @@
FROM ubuntu:20.04
ENV DEBIAN_FRONTEND=noninteractive
COPY . /extractor/
RUN apt-get update
# Install tzdata in non-interactive mode; otherwise the build prompts for a timezone.
RUN apt-get install -y --no-install-recommends tzdata
RUN apt-get install -y python3 python3-pip swig
RUN apt-get install -y git android-sdk-libsparse-utils liblz4-tool brotli unrar
RUN apt-get install -y zip rsync
RUN cd /extractor && pip3 install -r requirements.txt
ENTRYPOINT ["/extractor/extractor.py"]

README.md

@@ -1,2 +1,67 @@
# extractor
Extractor: The Android firmware image extraction tool
# Extractor · [![GitHub license](https://img.shields.io/badge/license-Apache%202.0-blue)](#LICENSE)
<p align="center">
<img src="/docs/media/ext.png">
</p>
Extractor is a powerful Android firmware image extraction utility.
# Installation
To run Extractor on your computer, a few preparation steps are required. Since Extractor is a Python tool, a working Python environment is needed. Extractor also depends on several git submodules, all of which can be initialized like so:
```bash
# Initialize git submodules
./scripts/init.sh
```
If you wish to run Extractor without installing the necessary requirements yourself, you can run it using Docker instead (see the Docker section below).
## Debian-based (Debian, Ubuntu)
These instructions currently cover Debian 10 and Ubuntu 20.04. Use a terminal shell to execute the following commands:
```bash
sudo apt update
# Install dependencies
sudo apt install -y git android-sdk-libsparse-utils liblz4-tool brotli unrar
```
We recommend using a Python virtualenv for installing Extractor's Python dependencies:
```bash
# Create virtualenv in venv directory
python3 -m venv venv
# Activate virtualenv
source venv/bin/activate
```
Now, install the Python dependencies:
```bash
pip3 install -r requirements.txt
```
# Usage
You can run Extractor on your machine by running:
```bash
sudo ./extractor.py <firmware image> --system-dir-output <output directory>
```
This will extract the firmware image into the specified output directory. Extractor also supports saving the output as a tar archive:
```bash
sudo ./extractor.py <firmware image> --tar-output
```
Note: root privileges are required because extraction temporarily performs loopback mount operations.
## Docker
The extract-docker.py helper builds the Docker image on first use and then runs Extractor inside a container, bind-mounting the input and output directories:
```bash
./extract-docker.py --in-file <firmware image> --out-dir <output directory>
```
## License
Extractor is [Apache 2.0 licensed](LICENSE).

104
construct_typing.py Normal file

@@ -0,0 +1,104 @@
# This file is part of Extractor.
# Copyright (C) 2021 Security Research Labs GmbH
# SPDX-License-Identifier: Apache-2.0
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TypeVar, Type
from io import BytesIO
from construct import Struct, Construct # type: ignore
# TypeVar is required so that parse returns the right type (of the sub-class).
# https://stackoverflow.com/a/46064289
# noinspection PyTypeChecker
T = TypeVar('T', bound='TypedContainer')
class TypedContainer:
"""
Base class for a typed struct for use with construct. Usage instructions:
* Make your own class with TypedContainer as superclass
* Define instance fields with typing (e.g. bytes or int)
* Set the class variable construct_struct to the actual construct Struct() definition
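Example (a minimal sketch; the Header class, its fields, and the input
bytes are illustrative, and Int32ul would come from construct):

    class Header(TypedContainer):
        magic: int
        size: int
        construct_struct = Struct("magic" / Int32ul, "size" / Int32ul)

    header = Header.parse(bytes([1, 0, 0, 0, 16, 0, 0, 0]))
    assert header.magic == 1 and header.size == 16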
"""
construct_struct: Struct
@classmethod
def parse(cls: Type[T], buf: bytes) -> T:
"""
Parses a buffer.
:param buf: raw bytes to parse
:return: an instance of the calling (sub-)class with its fields populated
"""
return cls.parse_stream(BytesIO(buf))
@classmethod
def parse_stream(cls: Type[T], stream) -> T:
self = cls()
construct_container = cls.construct_struct.parse_stream(stream)
for k, v in dict(construct_container).items():
self.__setattr__(k, v)
return self
@classmethod
def sizeof(cls):
return cls.construct_struct.sizeof()
def build(self) -> bytes:
return self.__class__.construct_struct.build(self.__dict__)
def __str__(self):
string_list = [self.__class__.__name__]
for k, v in sorted(self.__dict__.items()):
string_list.append(" %s = %r" % (k, v))
return "\n".join(string_list)
def __repr__(self):
field_params = []
# Use correct order
for field in self.construct_struct.subcons:
field_params.append("%s=%r" % (field.name, self.__getattribute__(field.name)))
return "%s(%s)" % (self.__class__.__name__, ", ".join(field_params))
def __eq__(self, other: T):
if type(self) is not type(other):
return False
for field in self.construct_struct.subcons:
if self.__getattribute__(field.name) != other.__getattribute__(field.name):
return False
return True
@classmethod
def as_inner_type(cls):
return InnerTypedContainer(cls)
class InnerTypedContainer(Construct):
inner_type: T
def __init__(self, inner_type):
super().__init__()
self.inner_type = inner_type
def _parse(self, stream, context, path):
return self.inner_type.parse_stream(stream)
def _build(self, obj, stream, context, path):
buf = obj.build()
stream.write(buf)
def _sizeof(self, context, path):
return self.inner_type.sizeof()

BIN
docs/media/ext.png Normal file

Binary file not shown. Size: 31 KiB

625
erofs_tool.py Executable file

@@ -0,0 +1,625 @@
#!/usr/bin/env python3
# This file is part of Extractor.
# Copyright (C) 2021 Security Research Labs GmbH
# SPDX-License-Identifier: Apache-2.0
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import mmap
import os
from construct import Struct, Int32ul, Int16ul, Int8ul, Int64ul, Array, Union
from enum import Enum
from typing import List, Set
import subprocess
from io import BytesIO
import math
import sys
from stat import S_IFLNK, S_IFDIR, S_IFREG, S_IFMT
# Parser for Huawei EROFS filesystem, used on some new models.
# Supported by Linux Kernel 4.19 and later
# drivers/staging/erofs
# Filesystem generation tool at https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs-utils.git/
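# Example invocations (file names below are placeholders):
#   ./erofs_tool.py extract system.erofs.img out_dir --verify-zip
#   ./erofs_tool.py file system.erofs.img /system/build.prop --extract build.prop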
def main():
parser = argparse.ArgumentParser(description='EROFS filesystem extractor')
sp = parser.add_subparsers()
p = sp.add_parser("debug", help="Run debug code")
p.set_defaults(target=command_debug)
p = sp.add_parser("check", help="Check a given filesystem")
p.add_argument("fn", help="EROFS image file")
p.set_defaults(target=command_check)
p = sp.add_parser("file", help="Check a given filesystem")
p.add_argument("fn", help="EROFS image file")
p.add_argument("path", help="Path within erofs")
p.add_argument("--verify", help="Path to verify file")
p.add_argument("--extract", help="Path to save extracted file")
p.set_defaults(target=command_file)
p = sp.add_parser("extract", help="Extract erofs to directory")
p.add_argument("erofs_image", help="Path to erofs image")
p.add_argument("output_dir", help="Output directory")
p.add_argument("--verify-zip", action="store_true", help="Run test on all zip/apk/jar files to ensure that extraction works correctly")
p.set_defaults(target=command_extract)
args = parser.parse_args()
if hasattr(args, "target"):
args.target(args)
else:
parser.print_help()
def command_debug(_args):
pass
def command_extract(args):
try:
os.mkdir(args.output_dir)
except FileExistsError:
assert os.path.isdir(args.output_dir), "Output %r is not a directory" % args.output_dir
assert len(os.listdir(args.output_dir)) == 0, "Output directory %r is not empty: %r" % (args.output_dir, os.listdir(args.output_dir))
erofs = Erofs(args.erofs_image)
erofs.root_inode.extract(args.output_dir.encode(), verify_zip=args.verify_zip)
def command_check(args):
erofs = Erofs(args.fn)
erofs.root_inode.traverse()
def command_file(args):
erofs = Erofs(args.fn)
file_inode = erofs.get_file(args.path.encode())
data = file_inode.get_data(debug=True)
if args.verify is not None:
verify_buf = open(args.verify, 'rb').read()
assert len(data) == len(verify_buf), "Verify length mismatch: %r <=> %r" % (len(data), len(verify_buf))
for i in range(len(data)):
assert data[i] == verify_buf[i], "Mismatch at 0x%x: %r <=> %r" % (i, data[i], verify_buf[i])
print("File verified OK")
if args.extract is not None:
with open(args.extract, 'wb') as f:
f.write(data)
# noinspection PyUnresolvedReferences
struct_erofs_super = Struct(
"magic" / Int32ul,
"checksum" / Int32ul,
"features" / Int32ul,
"blkszbits" / Int8ul,
"reserved" / Int8ul,
"root_nid" / Int16ul,
"inos" / Int64ul,
"build_time" / Int64ul,
"build_time_nsec" / Int32ul,
"blocks" / Int32ul,
"meta_blkaddr" / Int32ul,
"xattr_blkaddr" / Int32ul,
"uuid" / Array(16, Int8ul),
"volume_name" / Array(16, Int8ul),
"reserved2" / Array(48, Int8ul)
)
assert struct_erofs_super.sizeof() == 128, struct_erofs_super.sizeof()
class DataMappingMode(Enum):
EROFS_INODE_FLAT_PLAIN = 0
EROFS_INODE_FLAT_COMPRESSION_LEGACY = 1
EROFS_INODE_FLAT_INLINE = 2
EROFS_INODE_FLAT_COMPRESSION = 3
EROFS_INODE_LAYOUT_MAX = 4
# noinspection PyUnresolvedReferences
struct_erofs_inode_v1 = Struct(
"i_advise" / Int16ul,
"i_xattr_icount" / Int16ul,
"i_mode" / Int16ul,
"i_nlink" / Int16ul,
"i_size" / Int32ul,
"i_reserved" / Int32ul,
"i_u" / Int32ul,
"i_ino" / Int32ul,
"i_uid" / Int16ul,
"i_gid" / Int16ul,
"checksum" / Int32ul,
)
assert struct_erofs_inode_v1.sizeof() == 32, struct_erofs_inode_v1.sizeof()
class FileType(Enum):
EROFS_FT_UNKNOWN = 0
EROFS_FT_REG_FILE = 1
EROFS_FT_DIR = 2
EROFS_FT_CHRDEV = 3
EROFS_FT_BLKDEV = 4
EROFS_FT_FIFO = 5
EROFS_FT_SOCK = 6
EROFS_FT_SYMLINK = 7
EROFS_FT_MAX = 8
# noinspection PyUnresolvedReferences
struct_erofs_dirent = Struct(
"nid" / Int64ul,
"nameoff" / Int16ul,
"file_type" / Int8ul,
"reserved" / Int8ul
)
assert struct_erofs_dirent.sizeof() == 12, struct_erofs_dirent.sizeof()
class DecompressIndexType(Enum):
Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0
Z_EROFS_VLE_CLUSTER_TYPE_HEAD = 1
Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2
Z_EROFS_VLE_CLUSTER_TYPE_RESERVED = 3
# noinspection PyUnresolvedReferences
struct_z_erofs_vle_decompressed_index = Struct(
"di_advise" / Int16ul,
"di_clusterofs" / Int16ul,
"di_u" / Union(0,
"blkaddr" / Int32ul,
"delta" / Struct("delta0" / Int16ul, "delta1" / Int16ul)
)
)
assert struct_z_erofs_vle_decompressed_index.sizeof() == 8
# noinspection PyUnresolvedReferences
struct_z_erofs_map_header = Struct(
"h_reserved1" / Int32ul,
"h_advise" / Int16ul,
"h_algorithmtype" / Int8ul,
"h_clusterbits" / Int8ul
)
class Erofs:
def __init__(self, fn: str):
self.fn = fn
self.file_handle = open(fn, 'rb')
self.file_size = os.fstat(self.file_handle.fileno()).st_size
self.mmap = mmap.mmap(self.file_handle.fileno(), 0, mmap.MAP_SHARED, mmap.PROT_READ)
self.super = struct_erofs_super.parse(self.mmap[0x400:0x400+struct_erofs_super.sizeof()])
print("0x%08x-0x%08x: SUPER" % (0x400, 0x400 + struct_erofs_super.sizeof()))
assert self.super.magic == 0xe0f5e1e2, "0x%x" % self.super.magic
assert self.super.blkszbits == 12
print("root_nid=%r" % self.super.root_nid)
# print("super:\n%s" % self.super)
self.root_inode = self.get_inode(self.super.root_nid, FileType.EROFS_FT_DIR)
print("0x%08x-0x%08x: ROOT Inode" % (self.root_inode.inode_off, self.root_inode.inode_off + struct_erofs_inode_v1.sizeof()))
# print("root:\n%s" % self.root_inode)
# self.root_inode.traverse()
def get_inode(self, nid: int, file_type: FileType):
if file_type == FileType.EROFS_FT_DIR:
return DirInode(self, nid)
elif file_type == FileType.EROFS_FT_SYMLINK:
return SymlinkInode(self, nid)
elif file_type == FileType.EROFS_FT_REG_FILE:
return RegFileInode(self, nid)
else:
raise ValueError("inode type %r not supported" % file_type)
def get_inode_header(self, nid) -> struct_erofs_inode_v1:
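# An nid indexes 32-byte inode slots relative to the metadata area, whose
# start is given by meta_blkaddr in units of 4096-byte blocks.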
inode_off = self.super.meta_blkaddr * 4096 + 32 * nid
if inode_off + struct_erofs_inode_v1.sizeof() > self.file_size:
raise ValueError("Inode nid 0x016%x out of range" % nid)
inode_buf = self.mmap[inode_off:inode_off + struct_erofs_inode_v1.sizeof()]
return struct_erofs_inode_v1.parse(inode_buf)
def get_file(self, path: bytes) -> "Inode":
path = path.split(b'/')
path = [x for x in path if x != b'']
inode: DirInode = self.root_inode
for i in range(len(path)):
path_elem = path[i]
ok = False
for dirent in inode.get_dirents():
if dirent.filename == path_elem:
if i == len(path) - 1:
return self.get_inode(dirent.nid, dirent.file_type)
else:
next_inode = self.get_inode(dirent.nid, dirent.file_type)
if isinstance(next_inode, DirInode):
inode = next_inode
ok = True
else:
raise ValueError("Inode at %r is of type %r instead of DirInode" % (path[0:i + 1], type(next_inode)))
if not ok:
raise FileNotFoundError("Failed to find %r in %r" % (path[i], path[0:i]))
assert False, path
class Inode:
def __init__(self, erofs: Erofs, nid: int):
self.erofs = erofs
self.nid: int = nid
self.inode_off = erofs.super.meta_blkaddr * 4096 + 32 * nid
inode_buf = erofs.mmap[self.inode_off:self.inode_off + struct_erofs_inode_v1.sizeof()]
self.inode_header = struct_erofs_inode_v1.parse(inode_buf)
self.xattr_start_off = self.inode_off + struct_erofs_inode_v1.sizeof()
if self.inode_header.i_xattr_icount > 0:
self.xattr_size = 12 + (self.inode_header.i_xattr_icount - 1) * 4
else:
self.xattr_size = 0
self.data_mapping_mode = DataMappingMode(self.inode_header.i_advise >> 1)
assert self.inode_header.i_advise & 0x01 == 0
def get_data(self, debug=False) -> bytes:
if debug:
print("Inode(nid=%r).get_data(): data_mapping_mode=%s" % (self.nid, self.data_mapping_mode.name))
print("0x%08x-0x%08x: get_data Inode" % (self.inode_off, self.inode_off + struct_erofs_inode_v1.sizeof()))
print(self.inode_header)
if self.data_mapping_mode == DataMappingMode.EROFS_INODE_FLAT_INLINE:
# Last block of file is directly following the inode/xattr data
# Previous blocks are following this last block
last_block_data_off = self.xattr_start_off + self.xattr_size
last_block_data_size = 4096 - (last_block_data_off % 4096)
if last_block_data_size == 4096:
raise NotImplementedError("TODO: Check manually if there is a last block following the inode or not")
last_block_data = self.erofs.mmap[last_block_data_off: last_block_data_off + last_block_data_size]
if self.inode_header.i_size <= last_block_data_size:
return last_block_data[0:self.inode_header.i_size]
else:
# initial_blocks_data_off = last_block_data_off + last_block_data_size
# assert initial_blocks_data_off % 4096 == 0
initial_blocks_data_off = self.inode_header.i_u * 4096
initial_blocks_data_size = 4096 * math.ceil((self.inode_header.i_size - last_block_data_size) / 4096)
initial_blocks_data = self.erofs.mmap[initial_blocks_data_off:initial_blocks_data_off + initial_blocks_data_size]
assert len(initial_blocks_data) + len(last_block_data) >= self.inode_header.i_size
assert len(initial_blocks_data) + len(last_block_data) - self.inode_header.i_size < 4096
return (initial_blocks_data + last_block_data)[0:self.inode_header.i_size]
elif self.data_mapping_mode == DataMappingMode.EROFS_INODE_FLAT_COMPRESSION_LEGACY:
# print("HEADER: %s\n" % self.inode_header)
# i_u is number of compressed blocks for EROFS_INODE_LAYOUT_COMPRESSION
num_compressed_blocks = self.inode_header.i_u
if num_compressed_blocks > 30e3:
raise ValueError("Too may compressed blocks (self.inode_header.i_u=%r" % self.inode_header.i_u)
decompress_index_header_pos = self.xattr_start_off + self.xattr_size
# See Z_EROFS_VLE_LEGACY_INDEX_ALIGN(size)
# round_up to a multiple of 8 bytes
if decompress_index_header_pos % 8 == 4:
decompress_index_header_pos += 4
assert decompress_index_header_pos % 8 == 0
decompress_index_header_pos += struct_z_erofs_map_header.sizeof()
decompress_index_header_pos += 8 # Z_EROFS_VLE_LEGACY_HEADER_PADDING
assert decompress_index_header_pos % 8 == 0
# assert decompress_index_header_pos == self.xattr_start_off + self.xattr_size + 20
# assert False
prev_clusterofs = 0
num_decompressed_blocks = math.ceil(self.inode_header.i_size / 4096)
with BytesIO() as out:
prev_blkaddr = 0
prev_reserved_blkaddr = 0
for di_number in range(num_decompressed_blocks):
buf = self.erofs.mmap[decompress_index_header_pos + struct_z_erofs_vle_decompressed_index.sizeof() * di_number: decompress_index_header_pos + struct_z_erofs_vle_decompressed_index.sizeof() * (di_number + 1)]
# print(" %s" % codecs.encode(buf, 'hex').decode())
di = struct_z_erofs_vle_decompressed_index.parse(buf)
if debug:
print("DI %d/%d: adv=0x%04x %r" % (di_number, num_decompressed_blocks, di.di_advise, di))
print(" OFF %r" % ((2**16 + di.di_clusterofs - prev_clusterofs) % 2**16))
prev_clusterofs = di.di_clusterofs
Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT = 0
Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS = 2
# See vle_legacy_load_cluster_from_disk() in drivers/staging/erofs/zmap.c
type_int = (di.di_advise >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) & ((1 << Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1)
decompress_index_type = DecompressIndexType(type_int)
# print("DI %r: %r" % (di_number, decompress_index_type))
# print("OFFSET CHECK: %r <=> %r" % (out.tell() % 4096, di.di_clusterofs))
if decompress_index_type == DecompressIndexType.Z_EROFS_VLE_CLUSTER_TYPE_RESERVED:
if di.di_u.blkaddr == prev_blkaddr:
decompress_index_type = DecompressIndexType.Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD
else:
decompress_index_type = DecompressIndexType.Z_EROFS_VLE_CLUSTER_TYPE_HEAD
prev_blkaddr = di.di_u.blkaddr
if decompress_index_type == DecompressIndexType.Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
out.seek(di_number * 4096 + di.di_clusterofs)
assert out.tell() == di_number * 4096 + di.di_clusterofs
blkaddr = di.di_u.blkaddr
buf = self.erofs.mmap[4096 * blkaddr: 4096 * (blkaddr + 1)]
if self.inode_header.i_size < out.tell() + len(buf):
buf = buf[0:self.inode_header.i_size - out.tell()]
out.write(buf)
elif decompress_index_type == DecompressIndexType.Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
if out.tell() % 4096 != di.di_clusterofs:
if di.di_clusterofs == 0:
out.seek(out.tell() - (out.tell() % 4096))
else:
raise ValueError("Cluster offset check failed: %r <=> %r" % (out.tell() % 4096, di.di_clusterofs))
# assert out.tell() % 4096 == di.di_clusterofs, "Cluster offset check failed: %r <=> %r" % (out.tell() % 4096, di.di_clusterofs)
blkaddr = di.di_u.blkaddr
compressed_buf = self.erofs.mmap[4096 * blkaddr: 4096 * (blkaddr + 1)]
# hd(compressed_buf)
# decompressed_buf = pp_decompress_lz4(compressed_buf, maxlen=self.inode_header.i_size - out.tell(), expected=open("/usr/bin/lxc", "rb").read()[out.tell():])
decompressed_buf = pp_decompress_lz4(compressed_buf, maxlen=self.inode_header.i_size - out.tell())
out.write(decompressed_buf)
elif decompress_index_type == DecompressIndexType.Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
pass
elif decompress_index_type == DecompressIndexType.Z_EROFS_VLE_CLUSTER_TYPE_RESERVED:
blkaddr = di.di_u.blkaddr
if blkaddr == prev_reserved_blkaddr:
continue
else:
prev_reserved_blkaddr = blkaddr
compressed_buf = self.erofs.mmap[4096 * blkaddr: 4096 * (blkaddr + 1)]
# hd(compressed_buf)
decompressed_buf = pp_decompress_lz4(compressed_buf, maxlen=self.inode_header.i_size - out.tell())
print("len(decompressed_buf)=%r decompressed_buf[0:50] = %r" % (len(decompressed_buf), decompressed_buf[0:50]))
out.write(decompressed_buf)
else:
raise ValueError("Unexpected decompress_index_type %r" % decompress_index_type)
if self.inode_header.checksum != 0:
raise NotImplementedError("Checksum verification not yet implemented")
if out.tell() == self.inode_header.i_size:
return out.getvalue()
elif out.tell() > self.inode_header.i_size:
return out.getvalue()[0:self.inode_header.i_size]
else:
raise ValueError("Bad file size %r (expected: %r)" % (out.tell(), self.inode_header.i_size))
elif self.data_mapping_mode == DataMappingMode.EROFS_INODE_FLAT_PLAIN:
# print("HEADER: %s\n" % self.inode_header)
last_block_data_off = self.inode_header.i_u * 4096
data_size = self.inode_header.i_size
data = self.erofs.mmap[last_block_data_off:last_block_data_off+data_size]
# assert False
return data
elif self.data_mapping_mode == DataMappingMode.EROFS_INODE_FLAT_COMPRESSION:
raise NotImplementedError("TODO: Implement EROFS_INODE_FLAT_COMPRESSION")
else:
raise ValueError("Don't know how to get data for data_mapping_mode=%r" % self.data_mapping_mode)
def get_data_dir(self, debug=False) -> bytes:
"""
Gets the directory data (struct erofs_dirent + filename buffer).
Separate function required since EROFS_INODE_FLAT_INLINE behaves differently for directories
and regular files
:param debug:
:return:
"""
if debug:
print("Inode(nid=%r).get_data(): data_mapping_mode=%s" % (self.nid, self.data_mapping_mode.name))
print("0x%08x-0x%08x: get_data Inode" % (self.inode_off, self.inode_off + struct_erofs_inode_v1.sizeof()))
print(self.inode_header)
if self.data_mapping_mode == DataMappingMode.EROFS_INODE_FLAT_INLINE:
# For directories with EROFS_INODE_FLAT_INLINE, the full data is (sequentially) following the inode header/xattr.
data_off = self.xattr_start_off + self.xattr_size
data_size = self.inode_header.i_size
return self.erofs.mmap[data_off: data_off + data_size]
else:
# Other modes behave the same for directories and regular file data
return self.get_data()
class DirEnt:
def __init__(self, filename: bytes, file_type: FileType, nid: int):
self.filename: bytes = filename
self.file_type: FileType = file_type
self.nid: int = nid
def __repr__(self):
return "DirEnt(%r, %r, %r)" % (self.filename, self.file_type, self.nid)
class DirInode(Inode):
def __init__(self, erofs: Erofs, nid: int):
super(DirInode, self).__init__(erofs, nid)
if S_IFMT(self.inode_header.i_mode) != S_IFDIR:
raise ValueError("DirInode at nid=0x%16x is not of type S_IFDIR, self.inode_header.i_mode=0x%08x" % (nid, self.inode_header.i_mode))
# print("self.inode_off=0x%x" % self.inode_off)
# print(self.inode_header)
data = self.get_data_dir()
self.dirents: List[DirEnt] = []
if len(data) == 0:
return
# hd(data)
dirent0 = struct_erofs_dirent.parse(data[0:12])
# print(dirent0)
# return
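# The filename area starts directly after the dirent array, so the first
# dirent's nameoff (a multiple of the 12-byte dirent size) determines the
# number of dirents.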
assert dirent0.nameoff % 12 == 0
num_dirents = int(dirent0.nameoff / 12)
struct_dirents: List[struct_erofs_dirent] = []
for i in range(num_dirents):
struct_dirents.append(struct_erofs_dirent.parse(data[12*i:12*i+12]))
self.dirents = []
filenames_done: Set[bytes] = set()
for i in range(num_dirents):
struct_dirent = struct_dirents[i]
name_end = len(data)
if i < num_dirents - 1:
name_end = struct_dirents[i+1].nameoff
filename = data[struct_dirent.nameoff:name_end]
filename = filename.split(b'\0', 1)[0]
if filename == b'':
raise ValueError("Empty filename")
if filename in filenames_done:
raise ValueError("Duplicate filename %r" % filename)
# print("FILE %r: %r" % (filename, struct_dirent))
assert len(filename) < 255, "Filename too long(%d bytes): %r..." % (len(filename), filename[0:50])
if struct_dirent.file_type >= FileType.EROFS_FT_MAX.value:
raise ValueError("Bad struct_dirent.file_type %r" % struct_dirent.file_type)
file_type = FileType(struct_dirent.file_type)
dirent = DirEnt(filename, file_type, struct_dirent.nid)
self.dirents.append(dirent)
# print("%r" % dirent)
def get_dirents(self) -> List[DirEnt]:
return self.dirents
def traverse(self, prefix=b"/"):
for dirent in self.dirents:
print("TRAVERSE: %r => %r" % (prefix, dirent.filename))
child_inode = self.erofs.get_inode(dirent.nid, dirent.file_type)
if dirent.file_type == FileType.EROFS_FT_SYMLINK:
print("%s%s: %r => %r" % (prefix.decode(errors="ignore"), dirent.filename.decode(errors="ignore"), dirent, child_inode.get_symlink_dest()))
elif dirent.file_type == FileType.EROFS_FT_REG_FILE:
print("%s%s: %r" % (prefix.decode(errors="ignore"), dirent.filename.decode(errors="ignore"), dirent))
elif dirent.file_type == FileType.EROFS_FT_DIR:
# Some versions of mkfs.erofs add entries for "." and ".."
if dirent.filename in (b'.', b'..'):
continue
print("%s%s: %r" % (prefix.decode(errors="ignore"), dirent.filename.decode(errors="ignore"), dirent))
child_inode.traverse(prefix + dirent.filename + b'/')
def extract(self, output_dir: bytes, verify_zip: bool = False):
"""
Extracts this directory to output_dir.
:param output_dir:
Must already exist (as an empty directory)
:param verify_zip:
Verify all zip/jar/apk files in output (using "unzip -tqq") to detect potential extraction errors
:return:
"""
for dirent in self.dirents:
out_path = os.path.join(output_dir, dirent.filename)
print("Extracting %r" % out_path.decode())
if os.path.exists(out_path):
raise ValueError("Duplicate file %r" % out_path)
child_inode = self.erofs.get_inode(dirent.nid, dirent.file_type)
if dirent.file_type == FileType.EROFS_FT_SYMLINK:
os.symlink(child_inode.get_symlink_dest(), out_path)
elif dirent.file_type == FileType.EROFS_FT_DIR:
# Some versions of mkfs.erofs add entries for "." and ".."
if dirent.filename in (b'.', b'..'):
continue
os.mkdir(out_path)
# Always make directories mode 755
os.chmod(out_path, 0o755)
child_inode.extract(out_path, verify_zip=verify_zip)
elif dirent.file_type == FileType.EROFS_FT_REG_FILE:
with open(out_path, 'wb') as f:
f.write(child_inode.get_data())
# use original mode & 0o755 => Ignore setuid/setgid bit
mode = child_inode.inode_header.i_mode & 0o777
# Ensure files are always readable
mode |= 0o444
os.chmod(out_path, mode)
if verify_zip:
ext = out_path.split(b'.')[-1].lower()
if ext in (b'zip', b'jar', b'apk'):
print("Verifying %r" % out_path)
subprocess.check_call(["unzip", "-tqq", out_path])
else:
raise NotImplementedError("file_type %r not implemented" % dirent.file_type)
class SymlinkInode(Inode):
def __init__(self, erofs: Erofs, nid: int):
super(SymlinkInode, self).__init__(erofs, nid)
if S_IFMT(self.inode_header.i_mode) != S_IFLNK:
raise ValueError("SymlinkInode at nid=0x%16x is not of type S_IFLNK, self.inode_header.i_mode=0x%08x" % (nid, self.inode_header.i_mode))
self.symlink_dest = self.get_data()
def get_symlink_dest(self):
return self.symlink_dest
class RegFileInode(Inode):
def __init__(self, erofs: Erofs, nid: int):
super(RegFileInode, self).__init__(erofs, nid)
if S_IFMT(self.inode_header.i_mode) != S_IFREG:
raise ValueError("RegFileInode at nid=0x%16x is not of type S_IFREG, self.inode_header.i_mode=0x%08x" % (nid, self.inode_header.i_mode))
def hd(buf: bytes):
sys.stdout.flush()
p = subprocess.Popen(["hd"], stdin=subprocess.PIPE)
p.stdin.write(buf)
p.stdin.close()
p.wait()
def pp_decompress_lz4(buf: bytes, maxlen: int = None, expected: bytes = None) -> bytes:
"""
https://github.com/lz4/lz4/blob/master/doc/lz4_Block_format.md
:param buf: Compressed buffer, raw LZ4 without framing or length header
:param maxlen: Maximum length to extract, will return buffer after extracting that amount of bytes
:param expected: Optional known decompressed value to debug extraction errors
:return:
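Worked example (illustrative): for the 7-byte input 45 41 42 43 44 04 00
(hex), token 0x45 encodes 4 literal bytes and a match length of 5 + 4 = 9;
the decoder writes the literal b'ABCD', then repeatedly copies from 4 bytes
back in the output until 9 match bytes are written, producing
b'ABCDABCDABCDA'.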
"""
with BytesIO() as out:
pos = 0
while pos < len(buf):
token_byte = buf[pos]
# print("Token 0x%02x at 0x%x" % (token_byte, pos))
pos += 1
# Get length of literal from input
literal_length = token_byte >> 4
if literal_length == 0xf:
length_byte = buf[pos]
pos += 1
literal_length += length_byte
while length_byte == 0xff:
length_byte = buf[pos]
pos += 1
literal_length += length_byte
literal_buf = buf[pos: pos + literal_length]
pos += literal_length
if expected is not None:
for i in range(len(literal_buf)):
assert literal_buf[i] == expected[out.tell() + i], "Mismatch at position 0x%x: %r <=> %r" % (out.tell() + i, literal_buf[i], expected[out.tell() + i])
out.write(literal_buf)
if maxlen is not None and out.tell() >= maxlen:
return out.getvalue()[0:maxlen]
if pos == len(buf) or pos == len(buf) - 1:
# Reached end of input after literal => OK
break
# print("OFFSET POS: 0x%x" % pos)
# Get offset for copy operation
offset = buf[pos] + 256 * buf[pos + 1]
pos += 2
if offset == 0:
continue
# raise ValueError("Offset cannot be 0")
# Get matchlength for copy operation
matchlength = token_byte & 0x0f
if matchlength == 0xf:
length_byte = buf[pos]
pos += 1
matchlength += length_byte
while length_byte == 0xff:
length_byte = buf[pos]
pos += 1
matchlength += length_byte
matchlength += 4
match_pos = out.tell() - offset
while matchlength > 0:
copylen = min(matchlength, out.tell() - match_pos)
copybuf = out.getvalue()[match_pos: match_pos + copylen]
if expected is not None:
for i in range(len(copybuf)):
assert copybuf[i] == expected[out.tell() + i], "Mismatch at position %r" % (out.tell() + i)
out.write(copybuf)
if maxlen is not None and out.tell() >= maxlen:
return out.getvalue()[0:maxlen]
matchlength -= copylen
# print("copylen=%r" % copylen)
# Copy from the original position => Copy as many bytes as possible at a time
assert copylen % offset == 0 or matchlength == 0
# match_pos += copylen % offset
# Old, un-optimized code:
# for i in range(matchlength):
# out.write(out.getvalue()[match_pos + i:match_pos + i + 1])
return out.getvalue()
if __name__ == "__main__":
main()

96
extract-docker.py Executable file

@@ -0,0 +1,96 @@
#!/usr/bin/env python3
# This file is part of Extractor.
# Copyright (C) 2021 Security Research Labs GmbH
# SPDX-License-Identifier: Apache-2.0
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import time
import argparse
import pathlib
import logging
import subprocess
def main():
parser = argparse.ArgumentParser("Extract using docker extractor image")
parser.add_argument("--in-file", type=lambda p: pathlib.Path(p).absolute(), required=True, help="Input file (e.g. Android image)")
parser.add_argument("--out-dir", type=lambda p: pathlib.Path(p).absolute(), required=True, help="Output directory")
parser.add_argument('--force-cleanup-and-rebuild', action='store_true')
args = parser.parse_args()
logging.basicConfig(format='%(levelname)s:%(asctime)s:%(message)s', level=logging.DEBUG)
# Abort if out dir does not exist or is non-empty
if not args.out_dir.is_dir():
logging.error("[!] %s not a directory, exiting", args.out_dir)
sys.exit(1)
if any(args.out_dir.iterdir()):
logging.error("[!] %s not empty, exiting", args.out_dir)
sys.exit(1)
start_time = time.time()
logging.info("[+] Check if docker image is up-to-date")
extractor_revision = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"], cwd=pathlib.Path(__file__).absolute().parents[0]).strip().decode()
image_name = "extractor_image:" + extractor_revision
extractor_image_exists = False
# Check if some extractor_image exists (all versions), if not build
extractor_image_list = subprocess.check_output(["docker", "images", "-q", "extractor_image"], stderr=subprocess.DEVNULL).splitlines()
if not extractor_image_list:
logging.info("[+] Building docker image %s", image_name)
subprocess.check_output(["docker", "build", ".", "-t", image_name])
else:
# If extractor_image already exists, check if we want to force rebuild
if args.force_cleanup_and_rebuild:
# Delete all existing extractor_image images
for image in extractor_image_list:
subprocess.check_output(["docker", "rmi", image.decode()])
# Build new image
subprocess.check_output(["docker", "build", ".", "-t", image_name])
else:
# Stop in case we find multiple local images or an outdated image
if len(extractor_image_list) != 1:
logging.error("[!] Too many local extractor_images exist, please use --force-cleanup-and-rebuild to cleanup and rebuild")
sys.exit(1)
elif subprocess.check_output(["docker", "images", "-q", image_name], stderr=subprocess.DEVNULL).strip() not in extractor_image_list:
logging.error("[!] Your existing local image %s is outdated, please use --force-cleanup-and-rebuild to rebuild", extractor_image_list[0].decode())
sys.exit(1)
logging.info("[+] Running extractor with docker image %s", image_name)
subprocess.check_call([
"docker",
"run",
"--privileged",
"--mount",
"type=bind,src=" + str(args.in_file.parents[0]) + ",dst=/in_dir",
"--mount",
"type=bind,src=" + str(args.out_dir) + ",dst=/out_dir",
"--rm",
image_name,
"/in_dir/" + args.in_file.name,
"--system-dir-output",
"/out_dir/"
])
duration = time.time() - start_time
logging.info("%s", f"[+] Output saved to {str(args.out_dir)} in {duration}s")
if __name__ == "__main__":
main()

2290
extractor.py Executable file

File diff suppressed because it is too large

27
init.sh Executable file

@@ -0,0 +1,27 @@
#!/usr/bin/env bash
set -e
pushd () {
command pushd "$@" > /dev/null
}
popd () {
command popd "$@" > /dev/null
}
echo "*** Initializing Extractor build environment"
# Init git submodule
git submodule update --init
PROJECT_ROOT=$(git rev-parse --show-toplevel)
# Save current directory.
pushd .
cd "$PROJECT_ROOT/sinextract"
make
# Restore initial directory.
popd

1
nokia-dumper Submodule

@@ -0,0 +1 @@
Subproject commit 4d0c8a027a137da6c5e4687bf70c5d0716fc84a9

19
requirements.txt Normal file

@@ -0,0 +1,19 @@
certifi==2020.12.5
cffi==1.14.3
chardet==3.0.4
construct==2.10.56
crypto==1.4.1
cryptography==3.2.1
et-xmlfile==1.0.1
idna==2.10
jdcal==1.4.1
lxml==4.6.2
Naked==0.1.31
protobuf==3.15.1
pycryptodome==3.9.9
pytlv==0.71
PyYAML==5.3.1
requests==2.25.0
shellescape==3.8.1
six==1.15.0
urllib3==1.26.2

1
sinextract Submodule

@@ -0,0 +1 @@
Subproject commit 11d89e213312de6141028729e4f881d011a06227

1
splituapp Submodule

@@ -0,0 +1 @@
Subproject commit ab9d3a3651b64bd4baec768771df119badfb5f6c