diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..4c3da3c --- /dev/null +++ b/.env.example @@ -0,0 +1,6 @@ +CACHE_DIR=/tmp/artifactview +MAX_ARTIFACT_SIZE=100000000 +MAX_AGE_H=12 +# If you only want to access public repositories, +# create a fine-grained token with Public Repositories (read-only) access +GITHUB_TOKEN=github_pat_123456 diff --git a/.gitignore b/.gitignore index ea8c4bf..4f83806 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /target +/.env diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..c77c173 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,12 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: end-of-file-fixer + + - repo: https://github.com/cathiele/pre-commit-rust + rev: v0.1.0 + hooks: + - id: cargo-fmt + - id: cargo-clippy + args: ["--all", "--tests", "--", "-D", "warnings"] diff --git a/Cargo.lock b/Cargo.lock index d885295..0af50cd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,30 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "getrandom", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.3" @@ -26,6 +50,80 @@ dependencies = [ "memchr", ] +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "annotate-snippets" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccaf7e9dfbb6ab22c82e473cd1a8a7bd313c19a5b7e40970f3d89ef5a5c9e81e" +dependencies = [ + "unicode-width", + "yansi-term", +] + +[[package]] +name = "anstream" +version = "0.6.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" + +[[package]] +name = "anstyle-parse" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a64c907d4e79225ac72e2a354c9ce84d50ebb4586dee56c82b3ee73004f537f5" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" +dependencies = [ + "anstyle", + "windows-sys 0.52.0", +] + [[package]] name = "anyhow" version = "1.0.86" @@ -33,25 +131,99 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" [[package]] -name = "arc-swap" -version = "1.7.1" +name = "array-init" +version = "0.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" +checksum = "23589ecb866b460d3a0f1278834750268c607e8e28a1b982c907219f3178cd72" +dependencies = [ + "nodrop", +] [[package]] name = "artifactview" version = "0.1.0" dependencies = [ - "anyhow", - "arc-swap", + "async_zip", + "axum", + "axum-extra", + "dotenvy", + "envy", + "flate2", + "futures-lite", + "headers", + "hex", + "http", + "mime", + "mime_guess", "once_cell", + "path_macro", + "percent-encoding", + "pin-project", "proptest", + "quick_cache", + "rand", "regex", "reqwest", "rstest", "serde", + "serde-env", + "serde-hex", "serde_json", + "siphasher", + "thiserror", "tokio", + "tokio-util", + "tower-http", + "tracing", + "tracing-subscriber", + "url", + "yarte", + "yarte_helpers", +] + +[[package]] +name = "async-compression" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c90a406b4495d129f00461241616194cb8a032c8d1c53c657f0961d5f8e0498" +dependencies = [ + "bzip2", + "deflate64", + "flate2", + "futures-core", + "futures-io", + "memchr", + "pin-project-lite", + "xz2", + "zstd 0.13.1", + "zstd-safe 7.1.0", +] + +[[package]] +name = "async-trait" +version = "0.1.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", +] + +[[package]] +name = "async_zip" +version = "0.0.17" +dependencies = [ + "async-compression", + "chrono", + "crc32fast", + "env_logger", + "futures-lite", + "pin-project", + "thiserror", + "tokio", + "tokio-util", + "zip", ] [[package]] @@ -66,6 +238,84 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" +[[package]] +name = "axum" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf" +dependencies = [ + "async-trait", + "axum-core", + "bytes", + "futures-util", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sync_wrapper 1.0.1", + "tokio", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-core" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a15c63fd72d41492dc4f497196f5da1fb04fb7529e631d73630d1b491e47a2e3" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "http-body-util", + "mime", + "pin-project-lite", + 
"rustversion", + "sync_wrapper 0.1.2", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-extra" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0be6ea09c9b96cb5076af0de2e383bd2bc0c18f827cf1967bdd353e0b910d733" +dependencies = [ + "axum", + "axum-core", + "bytes", + "futures-util", + "headers", + "http", + "http-body", + "http-body-util", + "mime", + "pin-project-lite", + "serde", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "backtrace" version = "0.3.71" @@ -81,12 +331,24 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + [[package]] name = "base64" version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "base64ct" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" + [[package]] name = "bit-set" version = "0.5.3" @@ -114,23 +376,64 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "bumpalo" version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "cc" version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41c270e7540d725e65ac7f1b212ac8ce349719624d7bcff99f8e2e488e8cf03f" +dependencies = [ + "jobserver", + "libc", + "once_cell", +] [[package]] name = "cfg-if" @@ -138,6 +441,46 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chrono" +version = "0.4.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "num-traits", + "windows-targets 0.52.5", +] + +[[package]] +name = 
"cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", +] + +[[package]] +name = "colorchoice" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" + +[[package]] +name = "constant_time_eq" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" + +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + [[package]] name = "core-foundation" version = "0.9.4" @@ -154,6 +497,91 @@ version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +[[package]] +name = "cpufeatures" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "deflate64" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83ace6c86376be0b6cdcf3fb41882e81d94b31587573d1cfa9d01cd06bba210d" + +[[package]] +name = "deranged" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +dependencies = [ + "powerfmt", +] + +[[package]] +name = "derive_more" +version = "0.99.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" +dependencies = [ + "convert_case", + "proc-macro2", + "quote", + "rustc_version", + "syn 1.0.109", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + +[[package]] +name = "dotenvy" +version = "0.15.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" + +[[package]] +name = "dtoa" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653" + [[package]] name = "encoding_rs" version = "0.8.34" @@ -163,6 +591,36 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "env_filter" +version = "0.1.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a009aa4810eb158359dda09d0c87378e4bbb89b5a801f016885a4707ba24f7ea" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b35839ba51819680ba087cd351788c9a3c476841207e0b8cee0b04722343b9" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "humantime", + "log", +] + +[[package]] +name = "envy" +version = "0.4.2" +dependencies = [ + "serde", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -185,6 +643,16 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +[[package]] +name = "flate2" +version = "1.0.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" @@ -230,6 +698,25 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +[[package]] +name = "futures-io" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" + +[[package]] +name = "futures-lite" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52527eb5074e35e9339c6b4e8d12600c7128b68fb25dcb9fa9dec18f7c25f3a5" +dependencies = [ + "fastrand", + "futures-core", + "futures-io", + "parking", + "pin-project-lite", +] + [[package]] name = "futures-sink" version = "0.3.30" @@ -254,6 +741,16 @@ dependencies = [ "pin-utils", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -302,6 +799,51 @@ version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +[[package]] +name = "headers" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "322106e6bd0cba2d5ead589ddb8150a13d7c4217cf80d7c4f682ca994ccc6aa9" +dependencies = [ + "base64 0.21.7", + "bytes", + "headers-core", + "http", + "httpdate", + "mime", + "sha1", +] + +[[package]] +name = "headers-core" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54b4a22553d4242c49fddb9ba998a99962b5cc6f22cb5a3482bec22522403ce4" +dependencies = [ + "http", +] + +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + [[package]] name = 
"http" version = "1.1.0" @@ -342,6 +884,18 @@ version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + [[package]] name = "hyper" version = "1.3.1" @@ -355,9 +909,10 @@ dependencies = [ "http", "http-body", "httparse", + "httpdate", "itoa", "pin-project-lite", - "smallvec", + "smallvec 1.13.2", "tokio", "want", ] @@ -398,6 +953,29 @@ dependencies = [ "tracing", ] +[[package]] +name = "iana-time-zone" +version = "0.1.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + [[package]] name = "idna" version = "0.5.0" @@ -418,18 +996,42 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "inout" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5" +dependencies = [ + "generic-array", +] + [[package]] name = "ipnet" version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" +[[package]] +name = "is_terminal_polyfill" +version = "1.70.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" + [[package]] name = "itoa" version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jobserver" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" +dependencies = [ + "libc", +] + [[package]] name = "js-sys" version = "0.3.69" @@ -463,12 +1065,45 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + [[package]] name = "log" version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name 
= "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + +[[package]] +name = "maybe-uninit" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" + [[package]] name = "memchr" version = "2.7.2" @@ -481,6 +1116,16 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "mime_guess" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" +dependencies = [ + "mime", + "unicase", +] + [[package]] name = "miniz_oxide" version = "0.7.3" @@ -519,6 +1164,28 @@ dependencies = [ "tempfile", ] +[[package]] +name = "nodrop" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" + +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + [[package]] name = "num-traits" version = "0.2.19" @@ -529,6 +1196,16 @@ dependencies = [ "libm", ] +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "object" version = "0.32.2" @@ -567,7 +1244,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.66", ] [[package]] @@ -588,6 +1265,70 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + +[[package]] +name = "parking" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb813b8af86854136c6922af0598d719255ecb2179515e6e7730d468f05c9cae" + +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec 1.13.2", + "windows-targets 0.52.5", +] + +[[package]] +name = "password-hash" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700" +dependencies = [ + "base64ct", + "rand_core", + "subtle", +] + +[[package]] +name = "path_macro" +version = "1.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6e819bbd49d5939f682638fa54826bf1650abddcd65d000923de8ad63cc7d15" + +[[package]] +name = "pbkdf2" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917" +dependencies = [ + "digest", + "hmac", + "password-hash", + "sha2", +] + [[package]] name = "percent-encoding" version = "2.3.1" @@ -611,7 +1352,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.66", ] [[package]] @@ -632,12 +1373,28 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "prettyplease" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c8646e95016a7a6c4adea95bafa8a16baab64b583356217f2c85db4a39d9a86" +dependencies = [ + "proc-macro2", + "syn 1.0.109", +] + [[package]] name = "proc-macro2" version = "1.0.84" @@ -673,6 +1430,18 @@ version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" +[[package]] +name = "quick_cache" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "347e1a588d1de074eeb3c00eadff93db4db65aeb62aee852b1efd0949fe65b6c" +dependencies = [ + "ahash", + "equivalent", + "hashbrown", + "parking_lot", +] + [[package]] name = "quote" version = "1.0.36" @@ -721,6 +1490,15 @@ dependencies = [ "rand_core", ] +[[package]] +name = "redox_syscall" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "469052894dcb553421e483e4209ee581a45100d31b4018de03e5a7ad86374a7e" +dependencies = [ + "bitflags 2.5.0", +] + [[package]] name = "regex" version = "1.10.4" @@ -762,7 +1540,7 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "566cafdd92868e0939d3fb961bd0dc25fcfaaed179291093b3d43e6b3150ea10" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "encoding_rs", "futures-core", @@ -786,7 +1564,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "sync_wrapper", + "sync_wrapper 0.1.2", "system-configuration", "tokio", "tokio-native-tls", @@ -821,7 +1599,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn", + "syn 2.0.66", "unicode-ident", ] @@ -859,7 +1637,7 @@ version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "29993a25686778eb88d4189742cd713c9bce943bc54251a33509dc63cbacf73d" dependencies = [ - "base64", + "base64 0.22.1", "rustls-pki-types", ] @@ -869,6 +1647,12 @@ version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "976295e77ce332211c0d24d92c0e83e50f5c5f046d11082cea19f3df13a3562d" +[[package]] +name = "rustversion" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" + [[package]] name = "rusty-fork" version = "0.3.0" @@ -896,6 +1680,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "security-framework" version = "2.11.0" @@ -934,6 +1724,28 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde-env" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c68119a0846249fd6f4b38561b4b4727dbc4fd9fea074f1253bca7d50440ce58" +dependencies = [ + "anyhow", + "log", + "serde", +] + +[[package]] +name = "serde-hex" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca37e3e4d1b39afd7ff11ee4e947efae85adfddf4841787bfa47c470e96dc26d" +dependencies = [ + "array-init", + "serde", + "smallvec 0.6.14", +] + [[package]] name = "serde_derive" version = "1.0.203" @@ -942,7 +1754,7 @@ checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.66", ] [[package]] @@ -956,6 +1768,16 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_path_to_error" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af99884400da37c88f5e9146b7f1fd0fbcae8f6eec4e9da38b67d05486f814a6" +dependencies = [ + "itoa", + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -968,6 +1790,52 @@ dependencies = [ "serde", ] +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1" +dependencies = [ + "libc", +] + +[[package]] +name = "siphasher" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" + [[package]] name = "slab" version = "0.4.9" @@ -977,6 +1845,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "smallvec" +version = "0.6.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97fcaeba89edba30f044a10c6a3cc39df9c3f17d7cd829dd1446cab35f890e0" +dependencies = [ + "maybe-uninit", +] + [[package]] name = "smallvec" version = "1.13.2" @@ -993,6 +1870,23 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "subtle" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" + +[[package]] +name = "syn" +version = "1.0.109" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.66" @@ -1010,6 +1904,12 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +[[package]] +name = "sync_wrapper" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" + [[package]] name = "system-configuration" version = "0.5.1" @@ -1043,6 +1943,55 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "thiserror" +version = "1.0.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", +] + +[[package]] +name = "thread_local" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" +dependencies = [ + "cfg-if", + "once_cell", +] + +[[package]] +name = "time" +version = "0.3.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" +dependencies = [ + "deranged", + "num-conv", + "powerfmt", + "serde", + "time-core", +] + +[[package]] +name = "time-core" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" + [[package]] name = "tinyvec" version = "1.6.0" @@ -1068,7 +2017,10 @@ dependencies = [ "bytes", "libc", "mio", + "num_cpus", + "parking_lot", "pin-project-lite", + "signal-hook-registry", "socket2", "tokio-macros", "windows-sys 0.48.0", @@ -1082,7 +2034,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.66", ] [[package]] @@ -1103,11 +2055,21 @@ checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" dependencies = [ "bytes", "futures-core", + "futures-io", "futures-sink", "pin-project-lite", "tokio", ] +[[package]] +name = "toml" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" +dependencies = [ + "serde", +] + [[package]] name = "tower" version = "0.4.13" @@ -1121,6 +2083,24 @@ dependencies = [ "tokio", "tower-layer", "tower-service", + "tracing", +] + +[[package]] +name = "tower-http" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" +dependencies = [ + "bitflags 2.5.0", + "bytes", + "http", + "http-body", + "http-body-util", + "pin-project-lite", + "tower-layer", + "tower-service", + "tracing", ] [[package]] @@ -1141,10 +2121,23 @@ version = "0.1.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" dependencies = [ + "log", "pin-project-lite", + "tracing-attributes", "tracing-core", ] +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", +] + [[package]] name = "tracing-core" version = "0.1.32" @@ -1152,6 +2145,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" dependencies = [ "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +dependencies = [ + "nu-ansi-term", + "sharded-slab", + "smallvec 1.13.2", + "thread_local", + "tracing-core", + "tracing-log", ] [[package]] @@ -1160,12 +2179,27 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + [[package]] name = "unarray" version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" +[[package]] +name = "unicase" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89" +dependencies = [ + "version_check", +] + [[package]] name = "unicode-bidi" version = "0.3.15" @@ -1187,6 +2221,18 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-width" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f5e5f3158ecfd4b8ff6fe086db7c8467a2dfdac97fe420f2b7c4aa97af66d6" + +[[package]] +name = "unicode-xid" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" + [[package]] name = "url" version = "2.5.0" @@ -1198,12 +2244,46 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + +[[package]] +name = "v_eval" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dd8b599d797eb038d0dde9a3860aacb6bbba3bffa4ac64f807c8673820cc9d9" +dependencies = [ + "regex", + "syn 1.0.109", +] + +[[package]] +name = "v_htmlescape" +version = "0.15.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e8257fbc510f0a46eb602c10215901938b5c2a7d5e70fc11483b1d3c9b5b18c" + +[[package]] +name = "valuable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" + [[package]] name = "vcpkg" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + [[package]] name = "wait-timeout" version = "0.2.0" @@ -1249,7 +2329,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 2.0.66", "wasm-bindgen-shared", ] @@ -1283,7 +2363,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.66", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -1304,6 +2384,37 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.5", +] + [[package]] name = "windows-sys" version = "0.48.0" @@ -1452,3 +2563,193 @@ dependencies = [ "cfg-if", "windows-sys 0.48.0", ] + +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + +[[package]] +name = "yansi-term" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe5c30ade05e61656247b2e334a031dfd0cc466fadef865bdcdea8d537951bf1" +dependencies = [ + "winapi", +] + +[[package]] +name = "yarte" +version = "0.15.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfce1df93f3b16e5272221a559e60bbbaaa71dbc042a43996d223e51a690aab2" +dependencies = [ + "yarte_derive", + "yarte_helpers", +] + +[[package]] +name = "yarte_codegen" +version = "0.15.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a79312078b97a195de91a8c1457c2e0d7abd97e6e605f3cdeb01b3c105d2cff" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", + "yarte_helpers", + "yarte_hir", +] + +[[package]] +name = "yarte_derive" +version = "0.15.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b296edd7e1a81717b6f794baa2de8dfe89646050847161550b2d963b3ca6fe80" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", + "yarte_codegen", + "yarte_helpers", + "yarte_hir", + "yarte_parser", +] + +[[package]] +name = "yarte_helpers" +version = "0.15.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e0d1076f8cee9541ea5ffbecd9102f751252c91f085e7d30a18a3ce805ebd3ee" +dependencies = [ + "dtoa", + "itoa", + "prettyplease", + "serde", + "syn 1.0.109", + "toml", + "v_htmlescape", +] + +[[package]] +name = "yarte_hir" +version = "0.15.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dee42d2f704a3b1d8bc111d47a705d1302a0943d85e4c230f4e8300ee0dde4a6" +dependencies = [ + "derive_more", + "proc-macro2", + "quote", + "syn 1.0.109", + "v_eval", + "v_htmlescape", + "yarte_helpers", + "yarte_parser", +] + +[[package]] +name = "yarte_parser" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "538f72049cf7104e12d5c444048d112cb8fc788a31308afd912442a381ba860c" +dependencies = [ + "annotate-snippets", + "derive_more", + "proc-macro2", + "quote", + "serde", + "syn 1.0.109", + "unicode-xid", + "yarte_helpers", +] + +[[package]] +name = "zerocopy" +version = "0.7.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae87e3fcd617500e5d106f0380cf7b77f3c6092aae37191433159dda23cfb087" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15e934569e47891f7d9411f1a451d947a60e000ab3bd24fbb970f000387d1b3b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", +] + +[[package]] +name = "zip" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" +dependencies = [ + "aes", + "byteorder", + "bzip2", + "constant_time_eq", + "crc32fast", + "crossbeam-utils", + "flate2", + "hmac", + "pbkdf2", + "sha1", + "time", + "zstd 0.11.2+zstd.1.5.2", +] + +[[package]] +name = "zstd" +version = "0.11.2+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" +dependencies = [ + "zstd-safe 5.0.2+zstd.1.5.2", +] + +[[package]] +name = "zstd" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d789b1514203a1120ad2429eae43a7bd32b90976a7bb8a05f7ec02fa88cc23a" +dependencies = [ + "zstd-safe 7.1.0", +] + +[[package]] +name = "zstd-safe" +version = "5.0.2+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-safe" +version = "7.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cd99b45c6bc03a018c8b8a86025678c87e55526064e38f9df301989dce7ec0a" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.10+zstd.1.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c253a4914af5bafc8fa8c86ee400827e83cf6ec01195ec1f1ed8441bf00d65aa" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml index c181bb9..1f8173d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,15 +4,47 @@ version = "0.1.0" edition = "2021" [dependencies] -anyhow = "1.0.86" -arc-swap = "1.7.1" +async_zip = { path = "crates/async_zip", features = ["tokio", "tokio-fs", "deflate"] } +axum = { version = "0.7.5", features = ["http2"] } +axum-extra = { version = "0.9.3", features = ["typed-header"] } +dotenvy = "0.15.7" +envy = { path = "crates/envy" } +flate2 = "1.0.30" +futures-lite = "2.3.0" +headers = 
"0.4.0" +hex = "0.4.3" +http = "1.1.0" +mime = "0.3.17" +mime_guess = "2.0.4" once_cell = "1.19.0" +path_macro = "1.0.0" +percent-encoding = "2.3.1" +pin-project = "1.1.5" +quick_cache = "0.5.1" +rand = "0.8.5" regex = "1.10.4" reqwest = { version = "0.12.4", features = ["json"] } serde = { version = "1.0.203", features = ["derive"] } +serde-env = "0.1.1" +serde-hex = "0.1.0" serde_json = "1.0.117" -tokio = {version = "1.37.0", features = ["macros"]} +siphasher = "1.0.1" +thiserror = "1.0.61" +tokio = { version = "1.37.0", features = ["macros", "fs", "rt-multi-thread"] } +tokio-util = { version = "0.7.11", features = ["io"] } +tower-http = { version = "0.5.2", features = ["trace"] } +tracing = "0.1.40" +tracing-subscriber = "0.3.18" +url = "2.5.0" +yarte = "0.15.7" + +[build-dependencies] +yarte_helpers = "0.15.8" [dev-dependencies] proptest = "1.4.0" rstest = { version = "0.19.0", default-features = false } + +[workspace] +members = [".", "crates/*"] +resolver = "2" diff --git a/Justfile b/Justfile new file mode 100644 index 0000000..7e99822 --- /dev/null +++ b/Justfile @@ -0,0 +1,30 @@ +test: + cargo test + +release: + #!/usr/bin/env bash + set -e + + CRATE="artifactview" + CHANGELOG="CHANGELOG.md" + + VERSION=$(cargo pkgid --package "$CRATE" | tr '#@' '\n' | tail -n 1) + TAG="v${VERSION}" + echo "Releasing $TAG:" + + if git rev-parse "$TAG" >/dev/null 2>&1; then echo "version tag $TAG already exists"; exit 1; fi + + CLIFF_ARGS="--tag '${TAG}' --unreleased" + echo "git-cliff $CLIFF_ARGS" + if [ -f "$CHANGELOG" ]; then + eval "git-cliff $CLIFF_ARGS --prepend '$CHANGELOG'" + else + eval "git-cliff $CLIFF_ARGS --output '$CHANGELOG'" + fi + + git add "$CHANGELOG" + git commit -m "chore(release): release $CRATE v$VERSION" + + awk 'BEGIN{RS="(^|\n)## [^\n]+\n*"} NR==2 { print }' "$CHANGELOG" | git tag -as -F - --cleanup whitespace "$TAG" + + echo "🚀 Run 'git push origin $TAG' to publish" diff --git a/README.md b/README.md index 69cc74b..6d078a3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# artifactview +# Artifactview View CI build artifacts from Forgejo/Github using your web browser. @@ -20,4 +20,21 @@ status code 404 if no file was found. Artifactview accepts URLs in the given format: `-------.example.com` -Example: `github-com--theta-dev--example-project--4-11.example.com` +Example: `https://github-com--theta-dev--example-project--4-11.example.com` + +## Security considerations + +It is recommended to use the whitelist feature to limit Artifactview to access only trusted +servers, users and organizations. + +Since many +[well-known URIs](https://www.iana.org/assignments/well-known-uris/well-known-uris.xhtml) +are used to configure security-relevant properties of a website or are used to attest +ownership of a website (like `.well-known/acme-challenge` for issuing TLS certificates), +Artifactview will serve no files from the `.well-known` folder. + +There is a configurable limit for both the maximum downloaded artifact size and the +maximum size of individual files to be served (100MB by default). +Additionally there is a configurable timeout for the zip file indexing operation. +These measures should protect the server againt denial-of-service attacks like +overfilling the server drive or uploading zip bombs. 
diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..7745f71 --- /dev/null +++ b/build.rs @@ -0,0 +1,3 @@ +fn main() { + yarte_helpers::recompile::when_changed(); +} diff --git a/crates/async_zip/.cargo-ok b/crates/async_zip/.cargo-ok new file mode 100644 index 0000000..59cae28 --- /dev/null +++ b/crates/async_zip/.cargo-ok @@ -0,0 +1 @@ +{"v":1} diff --git a/crates/async_zip/.cargo_vcs_info.json b/crates/async_zip/.cargo_vcs_info.json new file mode 100644 index 0000000..5f86548 --- /dev/null +++ b/crates/async_zip/.cargo_vcs_info.json @@ -0,0 +1,6 @@ +{ + "git": { + "sha1": "e4ee7a521f624aea3c2c3eef6b78fb1ec057504b" + }, + "path_in_vcs": "" +} diff --git a/crates/async_zip/.github/dependabot.yml b/crates/async_zip/.github/dependabot.yml new file mode 100644 index 0000000..60ab683 --- /dev/null +++ b/crates/async_zip/.github/dependabot.yml @@ -0,0 +1,12 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + # Workflow files stored in the + # default location of `.github/workflows` + directory: "/" + schedule: + interval: "daily" + - package-ecosystem: "cargo" + directory: "/" + schedule: + interval: "daily" diff --git a/crates/async_zip/.github/workflows/ci-clippy.yml b/crates/async_zip/.github/workflows/ci-clippy.yml new file mode 100644 index 0000000..d3b12e7 --- /dev/null +++ b/crates/async_zip/.github/workflows/ci-clippy.yml @@ -0,0 +1,20 @@ +name: clippy (Linux) + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +env: + CARGO_TERM_COLOR: always + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Run clippy + run: cargo clippy --all-features -- -D clippy::all diff --git a/crates/async_zip/.github/workflows/ci-fmt.yml b/crates/async_zip/.github/workflows/ci-fmt.yml new file mode 100644 index 0000000..ea6d7e4 --- /dev/null +++ b/crates/async_zip/.github/workflows/ci-fmt.yml @@ -0,0 +1,20 @@ +name: rustfmt (Linux) + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +env: + CARGO_TERM_COLOR: always + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Run rustfmt + run: cargo fmt --check diff --git a/crates/async_zip/.github/workflows/ci-linux.yml b/crates/async_zip/.github/workflows/ci-linux.yml new file mode 100644 index 0000000..6d81998 --- /dev/null +++ b/crates/async_zip/.github/workflows/ci-linux.yml @@ -0,0 +1,51 @@ +name: Test (Linux) + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +env: + CARGO_TERM_COLOR: always + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Test [no features] + run: cargo test --verbose + + - name: Test ['chrono' feature] + run: cargo test --verbose --features chrono + + - name: Test ['tokio' feature] + run: cargo test --verbose --features tokio + + - name: Test ['tokio-fs' feature] + run: cargo test --verbose --features tokio-fs + + - name: Test ['deflate' feature] + run: cargo test --verbose --features deflate + + - name: Test ['bzip2' feature] + run: cargo test --verbose --features bzip2 + + - name: Test ['lzma' feature] + run: cargo test --verbose --features lzma + + - name: Test ['zstd' feature] + run: cargo test --verbose --features zstd + + - name: Test ['xz' feature] + run: cargo test --verbose --features xz + + - name: Test ['deflate64' feature] + run: cargo test --verbose --features deflate64 + + - name: Test ['full' feature] + run: cargo test --verbose --features full diff --git 
a/crates/async_zip/.github/workflows/ci-typos.yml b/crates/async_zip/.github/workflows/ci-typos.yml new file mode 100644 index 0000000..9e60d51 --- /dev/null +++ b/crates/async_zip/.github/workflows/ci-typos.yml @@ -0,0 +1,24 @@ +name: typos (Linux) + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +env: + CARGO_TERM_COLOR: always + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Install typos + run: cargo install typos-cli + + - name: Run typos + run: typos --format brief diff --git a/crates/async_zip/.github/workflows/ci-wasm.yml b/crates/async_zip/.github/workflows/ci-wasm.yml new file mode 100644 index 0000000..3214a73 --- /dev/null +++ b/crates/async_zip/.github/workflows/ci-wasm.yml @@ -0,0 +1,24 @@ +name: Build (WASM) + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +env: + CARGO_TERM_COLOR: always + +jobs: + build: + name: Build ['full-wasm' feature] on ${{ matrix.target }} + runs-on: ubuntu-latest + strategy: + matrix: + target: + - wasm32-wasi + - wasm32-unknown-unknown + steps: + - uses: actions/checkout@v4 + - run: rustup target add ${{ matrix.target }} + - run: cargo build --verbose --target ${{ matrix.target }} --features full-wasm diff --git a/crates/async_zip/.gitignore b/crates/async_zip/.gitignore new file mode 100644 index 0000000..a08f02c --- /dev/null +++ b/crates/async_zip/.gitignore @@ -0,0 +1,15 @@ +# Generated by Cargo +# will have compiled files and executables +/target/ +/examples/**/target/ + +# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries +# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html +/Cargo.lock + +# These are backup files generated by rustfmt +**/*.rs.bk +/examples/**/*.rs.bk + +# Ignore generated zip test file that is large +/src/tests/read/zip64/zip64many.zip diff --git a/crates/async_zip/Cargo.toml b/crates/async_zip/Cargo.toml new file mode 100644 index 0000000..2f32d76 --- /dev/null +++ b/crates/async_zip/Cargo.toml @@ -0,0 +1,63 @@ +[package] +name = "async_zip" +version = "0.0.17" +edition = "2021" +authors = ["Harry [hello@majored.pw]"] +repository = "https://github.com/Majored/rs-async-zip" +description = "An asynchronous ZIP archive reading/writing crate." 
+readme = "README.md" +license = "MIT" +documentation = "https://docs.rs/async_zip/" +homepage = "https://github.com/Majored/rs-async-zip" +keywords = ["async", "zip", "archive", "tokio"] +categories = ["asynchronous", "compression"] + +[features] +full = ["chrono", "tokio-fs", "deflate", "bzip2", "lzma", "zstd", "xz", "deflate64"] + +# All features that are compatible with WASM +full-wasm = ["chrono", "deflate", "zstd"] + +tokio = ["dep:tokio", "tokio-util", "tokio/io-util"] +tokio-fs = ["tokio/fs"] + +deflate = ["async-compression/deflate"] +bzip2 = ["async-compression/bzip2"] +lzma = ["async-compression/lzma"] +zstd = ["async-compression/zstd"] +xz = ["async-compression/xz"] +deflate64 = ["async-compression/deflate64"] + +[package.metadata.docs.rs] +all-features = true +# defines the configuration attribute `docsrs` +rustdoc-args = ["--cfg", "docsrs"] + +[dependencies] +crc32fast = "1" +futures-lite = { version = "2.1.0", default-features = false, features = ["std"] } +pin-project = "1" +thiserror = "1" + +async-compression = { version = "0.4.2", default-features = false, features = ["futures-io"], optional = true } +chrono = { version = "0.4", default-features = false, features = ["clock"], optional = true } +tokio = { version = "1", default-features = false, optional = true } +tokio-util = { version = "0.7", features = ["compat"], optional = true } + +[dev-dependencies] +# tests +tokio = { version = "1", features = ["full"] } +tokio-util = { version = "0.7", features = ["compat"] } +env_logger = "0.11.2" +zip = "0.6.3" + +# shared across multiple examples +# anyhow = "1" +# sanitize-filename = "0.5" + +# actix_multipart +# actix-web = "4" +# actix-multipart = "0.6" +# futures = "0.3" +# derive_more = "0.99" +# uuid = { version = "1", features = ["v4", "serde"] } diff --git a/crates/async_zip/LICENSE b/crates/async_zip/LICENSE new file mode 100644 index 0000000..ea2b727 --- /dev/null +++ b/crates/async_zip/LICENSE @@ -0,0 +1,22 @@ +MIT License + +Copyright (c) 2021 Harry +Copyright (c) 2023 Cognite AS + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/crates/async_zip/README.md b/crates/async_zip/README.md new file mode 100644 index 0000000..6e6a32a --- /dev/null +++ b/crates/async_zip/README.md @@ -0,0 +1,81 @@ +# async_zip +[![Crates.io](https://img.shields.io/crates/v/async_zip?style=flat-square)](https://crates.io/crates/async_zip) +[![Crates.io](https://img.shields.io/crates/d/async_zip?style=flat-square)](https://crates.io/crates/async_zip) +[![docs.rs](https://img.shields.io/docsrs/async_zip?style=flat-square)](https://docs.rs/async_zip/) +[![GitHub Workflow Status (branch)](https://img.shields.io/github/actions/workflow/status/Majored/rs-async-zip/ci-linux.yml?branch=main&style=flat-square)](https://github.com/Majored/rs-async-zip/actions?query=branch%3Amain) +[![GitHub](https://img.shields.io/github/license/Majored/rs-async-zip?style=flat-square)](https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +An asynchronous ZIP archive reading/writing crate. + +## Features +- A base implementation atop `futures`'s IO traits. +- An extended implementation atop `tokio`'s IO traits. +- Support for Stored, Deflate, bzip2, LZMA, zstd, and xz compression methods. +- Various different reading approaches (seek, stream, filesystem, in-memory buffer, etc). +- Support for writing complete data (u8 slices) or streams using data descriptors. +- Initial support for ZIP64 reading and writing. +- Aims for reasonable [specification](https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md) compliance. + +## Installation & Basic Usage + +```toml +[dependencies] +async_zip = { version = "0.0.17", features = ["full"] } +``` + +A (soon to be) extensive list of [examples](https://github.com/Majored/rs-async-zip/tree/main/examples) can be found under the `/examples` directory. + +### Feature Flags +- `full` - Enables all below features. +- `full-wasm` - Enables all below features that are compatible with WASM. +- `chrono` - Enables support for parsing dates via `chrono`. +- `tokio` - Enables support for the `tokio` implementation module. +- `tokio-fs` - Enables support for the `tokio::fs` reading module. +- `deflate` - Enables support for the Deflate compression method. +- `bzip2` - Enables support for the bzip2 compression method. +- `lzma` - Enables support for the LZMA compression method. +- `zstd` - Enables support for the zstd compression method. +- `xz` - Enables support for the xz compression method. + +### Reading +```rust +use tokio::{io::BufReader, fs::File}; +use async_zip::tokio::read::seek::ZipFileReader; +... + +let mut file = BufReader::new(File::open("./Archive.zip").await?); +let mut zip = ZipFileReader::with_tokio(&mut file).await?; + +let mut string = String::new(); +let mut reader = zip.reader_with_entry(0).await?; +reader.read_to_string_checked(&mut string).await?; + +println!("{}", string); +``` + +### Writing +```rust +use async_zip::tokio::write::ZipFileWriter; +use async_zip::{Compression, ZipEntryBuilder}; +use tokio::fs::File; +... + +let mut file = File::create("foo.zip").await?; +let mut writer = ZipFileWriter::with_tokio(&mut file); + +let data = b"This is an example file."; +let builder = ZipEntryBuilder::new("bar.txt".into(), Compression::Deflate); + +writer.write_entry_whole(builder, data).await?; +writer.close().await?; +``` + +## Contributions +Whilst I will be continuing to maintain this crate myself, reasonable specification compliance is a huge undertaking for a single individual. As such, contributions will always be encouraged and appreciated. 
+ +No contribution guidelines exist but additions should be developed with readability in mind, with appropriate comments, and make use of `rustfmt`. + +## Issues & Support +Whether you're wanting to report a bug you've come across during use of this crate or are seeking general help/assistance, please utilise the [issues tracker](https://github.com/Majored/rs-async-zip/issues) and provide as much detail as possible (eg. recreation steps). + +I try to respond to issues within a reasonable timeframe. diff --git a/crates/async_zip/SPECIFICATION.md b/crates/async_zip/SPECIFICATION.md new file mode 100644 index 0000000..707311e --- /dev/null +++ b/crates/async_zip/SPECIFICATION.md @@ -0,0 +1,3996 @@ +File: APPNOTE.TXT - .ZIP File Format Specification +Version: 6.3.9 +Status: FINAL - replaces version 6.3.8 +Revised: July 15, 2020 +Copyright (c) 1989 - 2014, 2018, 2019, 2020 PKWARE Inc., All Rights Reserved. + +## 1.0 Introduction + +## 1.1 Purpose + + ### 1.1.1 + This specification is intended to define a cross-platform, + interoperable file storage and transfer format. Since its + first publication in 1989, PKWARE, Inc. ("PKWARE") has remained + committed to ensuring the interoperability of the .ZIP file + format through periodic publication and maintenance of this + specification. We trust that all .ZIP compatible vendors and + application developers that use and benefit from this format + will share and support this commitment to interoperability. + +## 1.2 Scope + + ### 1.2.1 + ZIP is one of the most widely used compressed file formats. It is + universally used to aggregate, compress, and encrypt files into a single + interoperable container. No specific use or application need is + defined by this format and no specific implementation guidance is + provided. This document provides details on the storage format for + creating ZIP files. Information is provided on the records and + fields that describe what a ZIP file is. + +## 1.3 Trademarks + + ### 1.3.1 + PKWARE, PKZIP, Smartcrypt, SecureZIP, and PKSFX are registered + trademarks of PKWARE, Inc. in the United States and elsewhere. + PKPatchMaker, Deflate64, and ZIP64 are trademarks of PKWARE, Inc. + Other marks referenced within this document appear for identification + purposes only and are the property of their respective owners. + + +## 1.4 Permitted Use + + ### 1.4.1 + This document, "APPNOTE.TXT - .ZIP File Format Specification" is the + exclusive property of PKWARE. Use of the information contained in this + document is permitted solely for the purpose of creating products, + programs and processes that read and write files in the ZIP format + subject to the terms and conditions herein. + + ### 1.4.2 + Use of the content of this document within other publications is + permitted only through reference to this document. Any reproduction + or distribution of this document in whole or in part without prior + written permission from PKWARE is strictly prohibited. + + ### 1.4.3 + Certain technological components provided in this document are the + patented proprietary technology of PKWARE and as such require a + separate, executed license agreement from PKWARE. Applicable + components are marked with the following, or similar, statement: + 'Refer to the section in this document entitled "Incorporating + PKWARE Proprietary Technology into Your Product" for more information'. 
+ +## 1.5 Contacting PKWARE + + ### 1.5.1 + If you have questions on this format, its use, or licensing, or if you + wish to report defects, request changes or additions, please contact: + + PKWARE, Inc. + 201 E. Pittsburgh Avenue, Suite 400 + Milwaukee, WI 53204 + +1-414-289-9788 + +1-414-289-9789 FAX + zipformat@pkware.com + + ### 1.5.2 + Information about this format and a reference copy of this document + is publicly available at: + + http://www.pkware.com/appnote + +## 1.6 Disclaimer + + ### 1.6.1 + Although PKWARE will attempt to supply current and accurate + information relating to its file formats, algorithms, and the + subject programs, the possibility of error or omission cannot + be eliminated. PKWARE therefore expressly disclaims any warranty + that the information contained in the associated materials relating + to the subject programs and/or the format of the files created or + accessed by the subject programs and/or the algorithms used by + the subject programs, or any other matter, is current, correct or + accurate as delivered. Any risk of damage due to any possible + inaccurate information is assumed by the user of the information. + Furthermore, the information relating to the subject programs + and/or the file formats created or accessed by the subject + programs and/or the algorithms used by the subject programs is + subject to change without notice. + +## 2.0 Revisions + +## 2.1 Document Status + + ### 2.1.1 + If the STATUS of this file is marked as DRAFT, the content + defines proposed revisions to this specification which may consist + of changes to the ZIP format itself, or that may consist of other + content changes to this document. Versions of this document and + the format in DRAFT form may be subject to modification prior to + publication STATUS of FINAL. DRAFT versions are published periodically + to provide notification to the ZIP community of pending changes and to + provide opportunity for review and comment. + + ### 2.1.2 + Versions of this document having a STATUS of FINAL are + considered to be in the final form for that version of the document + and are not subject to further change until a new, higher version + numbered document is published. Newer versions of this format + specification are intended to remain interoperable with all prior + versions whenever technically possible. 
+ +## 2.2 Change Log + + Version Change Description Date + ------- ------------------ ---------- + 5.2 -Single Password Symmetric Encryption 07/16/2003 + storage + + 6.1.0 + -Smartcard compatibility 01/20/2004 + -Documentation on certificate storage + + 6.2.0 + -Introduction of Central Directory 04/26/2004 + Encryption for encrypting metadata + -Added OS X to Version Made By values + + 6.2.1 + -Added Extra Field placeholder for 04/01/2005 + POSZIP using ID 0x4690 + + -Clarified size field on + "zip64 end of central directory record" + + 6.2.2 + -Documented Final Feature Specification 01/06/2006 + for Strong Encryption + + -Clarifications and typographical + corrections + + 6.3.0 + -Added tape positioning storage 09/29/2006 + parameters + + -Expanded list of supported hash algorithms + + -Expanded list of supported compression + algorithms + + -Expanded list of supported encryption + algorithms + + -Added option for Unicode filename + storage + + -Clarifications for consistent use + of Data Descriptor records + + -Added additional "Extra Field" + definitions + + 6.3.1 + -Corrected standard hash values for 04/11/2007 + SHA-256/384/512 + + 6.3.2 + -Added compression method 97 09/28/2007 + + -Documented InfoZIP "Extra Field" + values for UTF-8 file name and + file comment storage + + 6.3.3 + -Formatting changes to support 09/01/2012 + easier referencing of this APPNOTE + from other documents and standards + + 6.3.4 + -Address change 10/01/2014 + + 6.3.5 + -Documented compression methods 16 11/31/2018 + and 99 (4.4.5, 4.6.1, 5.11, 5.17, + APPENDIX E) + + -Corrected several typographical + errors (2.1.2, 3.2, 4.1.1, 10.2) + + -Marked legacy algorithms as no + longer suitable for use (4.4.5.1) + + -Added clarity on MS DOS time format + (4.4.6) + + -Assign extrafield ID for Timestamps + (4.5.2) + + -Field code description correction (A.2) + + -More consistent use of MAY/SHOULD/MUST + + -Expanded 0x0065 record attribute codes (B.2) + + -Initial information on 0x0022 Extra Data + + 6.3.6 + -Corrected typographical error 04/26/2019 + (4.4.1.3) + + 6.3.7 + -Added Zstandard compression method ID + (4.4.5) + + -Corrected several reported typos + + -Marked intended use for general purpose bit 14 + + -Added Data Stream Alignment Extra Data info + (4.6.11) + + 6.3.8 + -Resolved Zstandard compression method ID conflict + (4.4.5) + + -Added additional compression method ID values in use + + 6.3.9 + -Corrected a typo in Data Stream Alignment description + (4.6.11) + + + + +## 3.0 Notations + + 3.1 Use of the term MUST or SHALL indicates a required element. + + 3.2 MUST NOT or SHALL NOT indicates an element is prohibited from use. + + 3.3 SHOULD indicates a RECOMMENDED element. + + 3.4 SHOULD NOT indicates an element NOT RECOMMENDED for use. + + 3.5 MAY indicates an OPTIONAL element. + + +## 4.0 ZIP Files + +## 4.1 What is a ZIP file + + ### 4.1.1 + ZIP files MAY be identified by the standard .ZIP file extension + although use of a file extension is not required. Use of the + extension .ZIPX is also recognized and MAY be used for ZIP files. + Other common file extensions using the ZIP format include .JAR, .WAR, + .DOCX, .XLSX, .PPTX, .ODT, .ODS, .ODP and others. Programs reading or + writing ZIP files SHOULD rely on internal record signatures described + in this document to identify files in this format. + + ### 4.1.2 + ZIP files SHOULD contain at least one file and MAY contain + multiple files. 
+ + ### 4.1.3 + Data compression MAY be used to reduce the size of files + placed into a ZIP file, but is not required. This format supports the + use of multiple data compression algorithms. When compression is used, + one of the documented compression algorithms MUST be used. Implementors + are advised to experiment with their data to determine which of the + available algorithms provides the best compression for their needs. + Compression method 8 (Deflate) is the method used by default by most + ZIP compatible application programs. + + + ### 4.1.4 + Data encryption MAY be used to protect files within a ZIP file. + Keying methods supported for encryption within this format include + passwords and public/private keys. Either MAY be used individually + or in combination. Encryption MAY be applied to individual files. + Additional security MAY be used through the encryption of ZIP file + metadata stored within the Central Directory. See the section on the + Strong Encryption Specification for information. Refer to the section + in this document entitled "Incorporating PKWARE Proprietary Technology + into Your Product" for more information. + + ### 4.1.5 + Data integrity MUST be provided for each file using CRC32. + + ### 4.1.6 + Additional data integrity MAY be included through the use of + digital signatures. Individual files MAY be signed with one or more + digital signatures. The Central Directory, if signed, MUST use a + single signature. + + ### 4.1.7 + Files MAY be placed within a ZIP file uncompressed or stored. + The term "stored" as used in the context of this document means the file + is copied into the ZIP file uncompressed. + + ### 4.1.8 + Each data file placed into a ZIP file MAY be compressed, stored, + encrypted or digitally signed independent of how other data files in the + same ZIP file are archived. + + ### 4.1.9 + ZIP files MAY be streamed, split into segments (on fixed or on + removable media) or "self-extracting". Self-extracting ZIP + files MUST include extraction code for a target platform within + the ZIP file. + + ### 4.1.10 + Extensibility is provided for platform or application specific + needs through extra data fields that MAY be defined for custom + purposes. Extra data definitions MUST NOT conflict with existing + documented record definitions. + + ### 4.1.11 + Common uses for ZIP MAY also include the use of manifest files. + Manifest files store application specific information within a file stored + within the ZIP file. This manifest file SHOULD be the first file in the + ZIP file. This specification does not provide any information or guidance on + the use of manifest files within ZIP files. Refer to the application developer + for information on using manifest files and for any additional profile + information on using ZIP within an application. + + ### 4.1.12 + ZIP files MAY be placed within other ZIP files. + +## 4.2 ZIP Metadata + + ### 4.2.1 + ZIP files are identified by metadata consisting of defined record types + containing the storage information necessary for maintaining the files + placed into a ZIP file. Each record type MUST be identified using a header + signature that identifies the record type. Signature values begin with the + two byte constant marker of 0x4b50, representing the characters "PK". + + +## 4.3 General Format of a .ZIP file + + ### 4.3.1 + A ZIP file MUST contain an "end of central directory record". A ZIP + file containing only an "end of central directory record" is considered an + empty ZIP file. 
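To see 4.3.1 in miniature, here is a std-only Rust sketch (written for this note; all names are illustrative and not part of the crate) that emits the smallest legal ZIP file: a lone end of central directory record with every count zeroed, 22 bytes in total.

```rust
use std::io::Write;

// End of central directory signature; stored little-endian it reads
// "PK\x05\x06" on disk (see 4.2.1: every record signature starts 0x4b50).
const EOCD_SIG: u32 = 0x06054b50;

fn main() -> std::io::Result<()> {
    let mut zip = Vec::new();
    zip.extend_from_slice(&EOCD_SIG.to_le_bytes()); // signature
    zip.extend_from_slice(&0u16.to_le_bytes()); // number of this disk
    zip.extend_from_slice(&0u16.to_le_bytes()); // disk with central dir start
    zip.extend_from_slice(&0u16.to_le_bytes()); // entries on this disk
    zip.extend_from_slice(&0u16.to_le_bytes()); // total entries
    zip.extend_from_slice(&0u32.to_le_bytes()); // size of central directory
    zip.extend_from_slice(&0u32.to_le_bytes()); // offset of central directory
    zip.extend_from_slice(&0u16.to_le_bytes()); // .ZIP file comment length
    assert_eq!(zip.len(), 22); // the well-known minimum ZIP file size
    assert_eq!(&zip[..2], b"PK"); // the two byte constant marker from 4.2.1
    std::fs::File::create("empty.zip")?.write_all(&zip)
}
```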
Files MAY be added or replaced within a ZIP file, or deleted. + A ZIP file MUST have only one "end of central directory record". Other + records defined in this specification MAY be used as needed to support + storage requirements for individual ZIP files. + + ### 4.3.2 + Each file placed into a ZIP file MUST be preceded by a "local + file header" record for that file. Each "local file header" MUST be + accompanied by a corresponding "central directory header" record within + the central directory section of the ZIP file. + + ### 4.3.3 + Files MAY be stored in arbitrary order within a ZIP file. A ZIP + file MAY span multiple volumes or it MAY be split into user-defined + segment sizes. All values MUST be stored in little-endian byte order unless + otherwise specified in this document for a specific data element. + + ### 4.3.4 + Compression MUST NOT be applied to a "local file header", an "encryption + header", or an "end of central directory record". Individual "central + directory records" MUST NOT be compressed, but the aggregate of all central + directory records MAY be compressed. + + ### 4.3.5 + File data MAY be followed by a "data descriptor" for the file. Data + descriptors are used to facilitate ZIP file streaming. + + + ### 4.3.6 + Overall .ZIP file format: + + [local file header 1] + [encryption header 1] + [file data 1] + [data descriptor 1] + . + . + . + [local file header n] + [encryption header n] + [file data n] + [data descriptor n] + [archive decryption header] + [archive extra data record] + [central directory header 1] + . + . + . + [central directory header n] + [zip64 end of central directory record] + [zip64 end of central directory locator] + [end of central directory record] + + + ### 4.3.7 + Local file header: + + local file header signature 4 bytes (0x04034b50) + version needed to extract 2 bytes + general purpose bit flag 2 bytes + compression method 2 bytes + last mod file time 2 bytes + last mod file date 2 bytes + crc-32 4 bytes + compressed size 4 bytes + uncompressed size 4 bytes + file name length 2 bytes + extra field length 2 bytes + + file name (variable size) + extra field (variable size) + + ### 4.3.8 + File data + + Immediately following the local header for a file + SHOULD be placed the compressed or stored data for the file. + If the file is encrypted, the encryption header for the file + SHOULD be placed after the local header and before the file + data. The series of [local file header][encryption header] + [file data][data descriptor] repeats for each file in the + .ZIP archive. + + Zero-byte files, directories, and other file types that + contain no content MUST NOT include file data. + + ### 4.3.9 + Data descriptor: + + crc-32 4 bytes + compressed size 4 bytes + uncompressed size 4 bytes + +### 4.3.9.1 +This descriptor MUST exist if bit 3 of the general + purpose bit flag is set (see below). It is byte aligned + and immediately follows the last byte of compressed data. + This descriptor SHOULD be used only when it was not possible to + seek in the output .ZIP file, e.g., when the output .ZIP file + was standard output or a non-seekable device. For ZIP64(tm) format + archives, the compressed and uncompressed sizes are 8 bytes each. + +### 4.3.9.2 +When compressing files, compressed and uncompressed sizes + SHOULD be stored in ZIP64 format (as 8 byte values) when a + file's size exceeds 0xFFFFFFFF. However ZIP64 format MAY be + used regardless of the size of a file. 
When extracting, if + the zip64 extended information extra field is present for + the file the compressed and uncompressed sizes will be 8 + byte values. + +### 4.3.9.3 +Although not originally assigned a signature, the value + 0x08074b50 has commonly been adopted as a signature value + for the data descriptor record. Implementers SHOULD be + aware that ZIP files MAY be encountered with or without this + signature marking data descriptors and SHOULD account for + either case when reading ZIP files to ensure compatibility. + +### 4.3.9.4 +When writing ZIP files, implementors SHOULD include the + signature value marking the data descriptor record. When + the signature is used, the fields currently defined for + the data descriptor record will immediately follow the + signature. + +### 4.3.9.5 +An extensible data descriptor will be released in a + future version of this APPNOTE. This new record is intended to + resolve conflicts with the use of this record going forward, + and to provide better support for streamed file processing. + +### 4.3.9.6 +When the Central Directory Encryption method is used, + the data descriptor record is not required, but MAY be used. + If present, and bit 3 of the general purpose bit field is set to + indicate its presence, the values in fields of the data descriptor + record MUST be set to binary zeros. See the section on the Strong + Encryption Specification for information. Refer to the section in + this document entitled "Incorporating PKWARE Proprietary Technology + into Your Product" for more information. + + + ### 4.3.10 + Archive decryption header: + +### 4.3.10.1 +The Archive Decryption Header is introduced in version 6.2 + of the ZIP format specification. This record exists in support + of the Central Directory Encryption Feature implemented as part of + the Strong Encryption Specification as described in this document. + When the Central Directory Structure is encrypted, this decryption + header MUST precede the encrypted data segment. + +### 4.3.10.2 +The encrypted data segment SHALL consist of the Archive + extra data record (if present) and the encrypted Central Directory + Structure data. The format of this data record is identical to the + Decryption header record preceding compressed file data. If the + central directory structure is encrypted, the location of the start of + this data record is determined using the Start of Central Directory + field in the Zip64 End of Central Directory record. See the + section on the Strong Encryption Specification for information + on the fields used in the Archive Decryption Header record. + Refer to the section in this document entitled "Incorporating + PKWARE Proprietary Technology into Your Product" for more information. + + + ### 4.3.11 + Archive extra data record: + + archive extra data signature 4 bytes (0x08064b50) + extra field length 4 bytes + extra field data (variable size) + +### 4.3.11.1 +The Archive Extra Data Record is introduced in version 6.2 + of the ZIP format specification. This record MAY be used in support + of the Central Directory Encryption Feature implemented as part of + the Strong Encryption Specification as described in this document. + When present, this record MUST immediately precede the central + directory data structure. + +### 4.3.11.2 +The size of this data record SHALL be included in the + Size of the Central Directory field in the End of Central + Directory record. 
If the central directory structure is compressed, + but not encrypted, the location of the start of this data record is + determined using the Start of Central Directory field in the Zip64 + End of Central Directory record. Refer to the section in this document + entitled "Incorporating PKWARE Proprietary Technology into Your + Product" for more information. + + ### 4.3.12 + Central directory structure: + + [central directory header 1] + . + . + . + [central directory header n] + [digital signature] + + File header: + + central file header signature 4 bytes (0x02014b50) + version made by 2 bytes + version needed to extract 2 bytes + general purpose bit flag 2 bytes + compression method 2 bytes + last mod file time 2 bytes + last mod file date 2 bytes + crc-32 4 bytes + compressed size 4 bytes + uncompressed size 4 bytes + file name length 2 bytes + extra field length 2 bytes + file comment length 2 bytes + disk number start 2 bytes + internal file attributes 2 bytes + external file attributes 4 bytes + relative offset of local header 4 bytes + + file name (variable size) + extra field (variable size) + file comment (variable size) + + ### 4.3.13 + Digital signature: + + header signature 4 bytes (0x05054b50) + size of data 2 bytes + signature data (variable size) + + With the introduction of the Central Directory Encryption + feature in version 6.2 of this specification, the Central + Directory Structure MAY be stored both compressed and encrypted. + Although not required, it is assumed when encrypting the + Central Directory Structure, that it will be compressed + for greater storage efficiency. Information on the + Central Directory Encryption feature can be found in the section + describing the Strong Encryption Specification. The Digital + Signature record will be neither compressed nor encrypted. + + ### 4.3.14 + Zip64 end of central directory record + + zip64 end of central dir + signature 4 bytes (0x06064b50) + size of zip64 end of central + directory record 8 bytes + version made by 2 bytes + version needed to extract 2 bytes + number of this disk 4 bytes + number of the disk with the + start of the central directory 4 bytes + total number of entries in the + central directory on this disk 8 bytes + total number of entries in the + central directory 8 bytes + size of the central directory 8 bytes + offset of start of central + directory with respect to + the starting disk number 8 bytes + zip64 extensible data sector (variable size) + +### 4.3.14.1 +The value stored into the "size of zip64 end of central + directory record" SHOULD be the size of the remaining + record and SHOULD NOT include the leading 12 bytes. + + Size = SizeOfFixedFields + SizeOfVariableData - 12. + +### 4.3.14.2 +The above record structure defines Version 1 of the + zip64 end of central directory record. Version 1 was + implemented in versions of this specification preceding + 6.2 in support of the ZIP64 large file feature. The + introduction of the Central Directory Encryption feature + implemented in version 6.2 as part of the Strong Encryption + Specification defines Version 2 of this record structure. + Refer to the section describing the Strong Encryption + Specification for details on the version 2 format for + this record. Refer to the section in this document entitled + "Incorporating PKWARE Proprietary Technology into Your Product" + for more information applicable to use of Version 2 of this + record. 
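The 4.3.14.1 arithmetic trips implementations up often enough to be worth spelling out; a tiny sketch (constant and function names are mine) of why an empty extensible data sector always yields a stored size of 44:

```rust
// All fixed fields of the zip64 end of central directory record per
// 4.3.14: signature (4) + size (8) + two versions (2 + 2) + two disk
// numbers (4 + 4) + two entry counts (8 + 8) + cd size (8) + cd offset (8).
const ZIP64_EOCD_FIXED: u64 = 56;

// 4.3.14.1: Size = SizeOfFixedFields + SizeOfVariableData - 12, i.e. the
// stored value excludes the leading signature and the size field itself.
fn zip64_eocd_size_field(extensible_data_len: u64) -> u64 {
    ZIP64_EOCD_FIXED + extensible_data_len - 12
}

fn main() {
    assert_eq!(zip64_eocd_size_field(0), 44);
}
```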
+ +### 4.3.14.3 +Special purpose data MAY reside in the zip64 extensible + data sector field following either a V1 or V2 version of this + record. To ensure identification of this special purpose data + it MUST include an identifying header block consisting of the + following: + + Header ID - 2 bytes + Data Size - 4 bytes + + The Header ID field indicates the type of data that is in the + data block that follows. + + Data Size identifies the number of bytes that follow for this + data block type. + +### 4.3.14.4 +Multiple special purpose data blocks MAY be present. + Each MUST be preceded by a Header ID and Data Size field. Current + mappings of Header ID values supported in this field are as + defined in APPENDIX C. + + ### 4.3.15 + Zip64 end of central directory locator + + zip64 end of central dir locator + signature 4 bytes (0x07064b50) + number of the disk with the + start of the zip64 end of + central directory 4 bytes + relative offset of the zip64 + end of central directory record 8 bytes + total number of disks 4 bytes + + ### 4.3.16 + End of central directory record: + + end of central dir signature 4 bytes (0x06054b50) + number of this disk 2 bytes + number of the disk with the + start of the central directory 2 bytes + total number of entries in the + central directory on this disk 2 bytes + total number of entries in + the central directory 2 bytes + size of the central directory 4 bytes + offset of start of central + directory with respect to + the starting disk number 4 bytes + .ZIP file comment length 2 bytes + .ZIP file comment (variable size) + +## 4.4 Explanation of fields + + ### 4.4.1 + General notes on fields + +### 4.4.1.1 + All fields unless otherwise noted are unsigned and stored + in Intel low-byte:high-byte, low-word:high-word order. + +### 4.4.1.2 + String fields are not null terminated, since the length + is given explicitly. + +### 4.4.1.3 + The entries in the central directory MAY NOT necessarily + be in the same order that files appear in the .ZIP file. + +### 4.4.1.4 + If one of the fields in the end of central directory + record is too small to hold required data, the field SHOULD be + set to -1 (0xFFFF or 0xFFFFFFFF) and the ZIP64 format record + SHOULD be created. + +### 4.4.1.5 + The end of central directory record and the Zip64 end + of central directory locator record MUST reside on the same + disk when splitting or spanning an archive. + + ### 4.4.2 + version made by (2 bytes) + + ### 4.4.2.1 +The upper byte indicates the compatibility of the file + attribute information. If the external file attributes + are compatible with MS-DOS and can be read by PKZIP for + DOS version 2.04g then this value will be zero. If these + attributes are not compatible, then this value will + identify the host system on which the attributes are + compatible. Software can use this information to determine + the line record format for text files etc. + + ### 4.4.2.2 +The current mappings are: + + 0 - MS-DOS and OS/2 (FAT / VFAT / FAT32 file systems) + 1 - Amiga 2 - OpenVMS + 3 - UNIX 4 - VM/CMS + 5 - Atari ST 6 - OS/2 H.P.F.S. + 7 - Macintosh 8 - Z-System + 9 - CP/M 10 - Windows NTFS + 11 - MVS (OS/390 - Z/OS) 12 - VSE + 13 - Acorn Risc 14 - VFAT + 15 - alternate MVS 16 - BeOS + 17 - Tandem 18 - OS/400 + 19 - OS X (Darwin) 20 thru 255 - unused + + ### 4.4.2.3 +The lower byte indicates the ZIP specification version + (the version of this document) supported by the software + used to encode the file. 
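A one-function sketch of that split (the function name is mine), anticipating the value/10 and value mod 10 rule stated next:

```rust
// Illustrative: split "version made by" (4.4.2) into its host system
// byte and the encoder's specification version, e.g. 0x031E means
// UNIX-compatible attributes written by a version 3.0 encoder.
fn split_version_made_by(raw: u16) -> (u8, u8, u8) {
    let host = (raw >> 8) as u8; // 0 = MS-DOS/FAT, 3 = UNIX, ... (4.4.2.2)
    let spec = (raw & 0xff) as u8; // lower byte: specification version
    (host, spec / 10, spec % 10) // host, major, minor
}

fn main() {
    assert_eq!(split_version_made_by(0x031E), (3, 3, 0));
}
```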
The value/10 indicates the major + version number, and the value mod 10 is the minor version + number. + + ### 4.4.3 + version needed to extract (2 bytes) + + ### 4.4.3.1 +The minimum supported ZIP specification version needed + to extract the file, mapped as above. This value is based on + the specific format features a ZIP program MUST support to + be able to extract the file. If multiple features are + applied to a file, the minimum version MUST be set to the + feature having the highest value. New features or feature + changes affecting the published format specification will be + implemented using higher version numbers than the last + published value to avoid conflict. + + ### 4.4.3.2 +Current minimum feature versions are as defined below: + + 1.0 - Default value + 1.1 - File is a volume label + 2.0 - File is a folder (directory) + 2.0 - File is compressed using Deflate compression + 2.0 - File is encrypted using traditional PKWARE encryption + 2.1 - File is compressed using Deflate64(tm) + 2.5 - File is compressed using PKWARE DCL Implode + 2.7 - File is a patch data set + 4.5 - File uses ZIP64 format extensions + 4.6 - File is compressed using BZIP2 compression* + 5.0 - File is encrypted using DES + 5.0 - File is encrypted using 3DES + 5.0 - File is encrypted using original RC2 encryption + 5.0 - File is encrypted using RC4 encryption + 5.1 - File is encrypted using AES encryption + 5.1 - File is encrypted using corrected RC2 encryption** + 5.2 - File is encrypted using corrected RC2-64 encryption** + 6.1 - File is encrypted using non-OAEP key wrapping*** + 6.2 - Central directory encryption + 6.3 - File is compressed using LZMA + 6.3 - File is compressed using PPMd+ + 6.3 - File is encrypted using Blowfish + 6.3 - File is encrypted using Twofish + + ### 4.4.3.3 +Notes on version needed to extract + + * Early 7.x (pre-7.2) versions of PKZIP incorrectly set the + version needed to extract for BZIP2 compression to be 50 + when it SHOULD have been 46. + + ** Refer to the section on Strong Encryption Specification + for additional information regarding RC2 corrections. + + *** Certificate encryption using non-OAEP key wrapping is the + intended mode of operation for all versions beginning with 6.1. + Support for OAEP key wrapping MUST only be used for + backward compatibility when sending ZIP files to be opened by + versions of PKZIP older than 6.1 (5.0 or 6.0). + + + Files compressed using PPMd MUST set the version + needed to extract field to 6.3, however, not all ZIP + programs enforce this and MAY be unable to decompress + data files compressed using PPMd if this value is set. + + When using ZIP64 extensions, the corresponding value in the + zip64 end of central directory record MUST also be set. + This field SHOULD be set appropriately to indicate whether + Version 1 or Version 2 format is in use. + + + ### 4.4.4 + general purpose bit flag: (2 bytes) + + Bit 0: If set, indicates that the file is encrypted. + + (For Method 6 - Imploding) + Bit 1: If the compression method used was type 6, + Imploding, then this bit, if set, indicates + an 8K sliding dictionary was used. If clear, + then a 4K sliding dictionary was used. + + Bit 2: If the compression method used was type 6, + Imploding, then this bit, if set, indicates + 3 Shannon-Fano trees were used to encode the + sliding dictionary output. If clear, then 2 + Shannon-Fano trees were used. + + (For Methods 8 and 9 - Deflating) + Bit 2 Bit 1 + 0 0 Normal (-en) compression option was used. 
+ 0 1 Maximum (-exx/-ex) compression option was used. + 1 0 Fast (-ef) compression option was used. + 1 1 Super Fast (-es) compression option was used. + + (For Method 14 - LZMA) + Bit 1: If the compression method used was type 14, + LZMA, then this bit, if set, indicates + an end-of-stream (EOS) marker is used to + mark the end of the compressed data stream. + If clear, then an EOS marker is not present + and the compressed data size must be known + to extract. + + Note: Bits 1 and 2 are undefined if the compression + method is any other. + + Bit 3: If this bit is set, the fields crc-32, compressed + size and uncompressed size are set to zero in the + local header. The correct values are put in the + data descriptor immediately following the compressed + data. (Note: PKZIP version 2.04g for DOS only + recognizes this bit for method 8 compression, newer + versions of PKZIP recognize this bit for any + compression method.) + + Bit 4: Reserved for use with method 8, for enhanced + deflating. + + Bit 5: If this bit is set, this indicates that the file is + compressed patched data. (Note: Requires PKZIP + version 2.70 or greater) + + Bit 6: Strong encryption. If this bit is set, you MUST + set the version needed to extract value to at least + 50 and you MUST also set bit 0. If AES encryption + is used, the version needed to extract value MUST + be at least 51. See the section describing the Strong + Encryption Specification for details. Refer to the + section in this document entitled "Incorporating PKWARE + Proprietary Technology into Your Product" for more + information. + + Bit 7: Currently unused. + + Bit 8: Currently unused. + + Bit 9: Currently unused. + + Bit 10: Currently unused. + + Bit 11: Language encoding flag (EFS). If this bit is set, + the filename and comment fields for this file + MUST be encoded using UTF-8. (see APPENDIX D) + + Bit 12: Reserved by PKWARE for enhanced compression. + + Bit 13: Set when encrypting the Central Directory to indicate + selected data values in the Local Header are masked to + hide their actual values. See the section describing + the Strong Encryption Specification for details. Refer + to the section in this document entitled "Incorporating + PKWARE Proprietary Technology into Your Product" for + more information. + + Bit 14: Reserved by PKWARE for alternate streams. + + Bit 15: Reserved by PKWARE. 
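Since this flag word drives most reader behavior, here is a hedged sketch (struct and field names are mine) of the three bits a ZIP reader consults most often: bit 0 (encryption), bit 3 (sizes and CRC deferred to a data descriptor) and bit 11 (UTF-8 names):

```rust
// Illustrative decode of the general purpose bit flag (4.4.4).
struct GeneralPurposeFlags {
    encrypted: bool,       // bit 0
    data_descriptor: bool, // bit 3: real sizes/crc follow the file data
    utf8_names: bool,      // bit 11: name and comment are UTF-8
}

fn parse_flags(raw: u16) -> GeneralPurposeFlags {
    GeneralPurposeFlags {
        encrypted: raw & (1 << 0) != 0,
        data_descriptor: raw & (1 << 3) != 0,
        utf8_names: raw & (1 << 11) != 0,
    }
}

fn main() {
    let f = parse_flags(0x0808); // bits 3 and 11 set
    assert!(f.data_descriptor && f.utf8_names && !f.encrypted);
}
```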
+ + ### 4.4.5 + compression method: (2 bytes) + + 0 - The file is stored (no compression) + 1 - The file is Shrunk + 2 - The file is Reduced with compression factor 1 + 3 - The file is Reduced with compression factor 2 + 4 - The file is Reduced with compression factor 3 + 5 - The file is Reduced with compression factor 4 + 6 - The file is Imploded + 7 - Reserved for Tokenizing compression algorithm + 8 - The file is Deflated + 9 - Enhanced Deflating using Deflate64(tm) + 10 - PKWARE Data Compression Library Imploding (old IBM TERSE) + 11 - Reserved by PKWARE + 12 - File is compressed using BZIP2 algorithm + 13 - Reserved by PKWARE + 14 - LZMA + 15 - Reserved by PKWARE + 16 - IBM z/OS CMPSC Compression + 17 - Reserved by PKWARE + 18 - File is compressed using IBM TERSE (new) + 19 - IBM LZ77 z Architecture + 20 - deprecated (use method 93 for zstd) + 93 - Zstandard (zstd) Compression + 94 - MP3 Compression + 95 - XZ Compression + 96 - JPEG variant + 97 - WavPack compressed data + 98 - PPMd version I, Rev 1 + 99 - AE-x encryption marker (see APPENDIX E) + + ### 4.4.5.1 +Methods 1-6 are legacy algorithms and are no longer + recommended for use when compressing files. + + ### 4.4.6 + date and time fields: (2 bytes each) + + The date and time are encoded in standard MS-DOS format. + If input came from standard input, the date and time are + those at which compression was started for this data. + If encrypting the central directory and general purpose bit + flag 13 is set indicating masking, the value stored in the + Local Header will be zero. MS-DOS time format is different + from more commonly used computer time formats such as + UTC. For example, MS-DOS uses year values relative to 1980 + and 2 second precision. + + ### 4.4.7 + CRC-32: (4 bytes) + + The CRC-32 algorithm was generously contributed by + David Schwaderer and can be found in his excellent + book "C Programmers Guide to NetBIOS" published by + Howard W. Sams & Co. Inc. The 'magic number' for + the CRC is 0xdebb20e3. The proper CRC pre and post + conditioning is used, meaning that the CRC register + is pre-conditioned with all ones (a starting value + of 0xffffffff) and the value is post-conditioned by + taking the one's complement of the CRC residual. + If bit 3 of the general purpose flag is set, this + field is set to zero in the local header and the correct + value is put in the data descriptor and in the central + directory. When encrypting the central directory, if the + local header is not in ZIP64 format and general purpose + bit flag 13 is set indicating masking, the value stored + in the Local Header will be zero. + + ### 4.4.8 + compressed size: (4 bytes) + ### 4.4.9 + uncompressed size: (4 bytes) + + The size of the file compressed (4.4.8) and uncompressed, + (4.4.9) respectively. When a decryption header is present it + will be placed in front of the file data and the value of the + compressed file size will include the bytes of the decryption + header. If bit 3 of the general purpose bit flag is set, + these fields are set to zero in the local header and the + correct values are put in the data descriptor and + in the central directory. If an archive is in ZIP64 format + and the value in this field is 0xFFFFFFFF, the size will be + in the corresponding 8 byte ZIP64 extended information + extra field. 
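A sketch of the sentinel rule just stated (names are mine): a 32-bit header field of 0xFFFFFFFF defers to the 8-byte value carried in the zip64 extended information extra field.

```rust
// Illustrative: resolve a 4-byte size field against its optional zip64
// counterpart (4.4.8/4.4.9). `zip64_value` comes from the 0x0001 extra
// field when present.
fn effective_size(header_field: u32, zip64_value: Option<u64>) -> u64 {
    if header_field == u32::MAX {
        // The zip64 field must be present whenever the sentinel is used.
        zip64_value.expect("zip64 extended information extra field missing")
    } else {
        header_field as u64
    }
}

fn main() {
    assert_eq!(effective_size(1024, None), 1024);
    assert_eq!(effective_size(u32::MAX, Some(8u64 << 30)), 8u64 << 30);
}
```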
When encrypting the central directory, if the + local header is not in ZIP64 format and general purpose bit + flag 13 is set indicating masking, the value stored for the + uncompressed size in the Local Header will be zero. + + ### 4.4.10 + file name length: (2 bytes) + ### 4.4.11 + extra field length: (2 bytes) + ### 4.4.12 + file comment length: (2 bytes) + + The length of the file name, extra field, and comment + fields respectively. The combined length of any + directory record and these three fields SHOULD NOT + generally exceed 65,535 bytes. If input came from standard + input, the file name length is set to zero. + + + ### 4.4.13 + disk number start: (2 bytes) + + The number of the disk on which this file begins. If an + archive is in ZIP64 format and the value in this field is + 0xFFFF, the size will be in the corresponding 4 byte zip64 + extended information extra field. + + ### 4.4.14 + internal file attributes: (2 bytes) + + Bits 1 and 2 are reserved for use by PKWARE. + + ### 4.4.14.1 +The lowest bit of this field indicates, if set, + that the file is apparently an ASCII or text file. If not + set, that the file apparently contains binary data. + The remaining bits are unused in version 1.0. + + ### 4.4.14.2 +The 0x0002 bit of this field indicates, if set, that + a 4 byte variable record length control field precedes each + logical record indicating the length of the record. The + record length control field is stored in little-endian byte + order. This flag is independent of text control characters, + and if used in conjunction with text data, includes any + control characters in the total length of the record. This + value is provided for mainframe data transfer support. + + ### 4.4.15 + external file attributes: (4 bytes) + + The mapping of the external attributes is + host-system dependent (see 'version made by'). For + MS-DOS, the low order byte is the MS-DOS directory + attribute byte. If input came from standard input, this + field is set to zero. + + ### 4.4.16 + relative offset of local header: (4 bytes) + + This is the offset from the start of the first disk on + which this file appears, to where the local header SHOULD + be found. If an archive is in ZIP64 format and the value + in this field is 0xFFFFFFFF, the size will be in the + corresponding 8 byte zip64 extended information extra field. + + ### 4.4.17 + file name: (Variable) + + ### 4.4.17.1 +The name of the file, with optional relative path. + The path stored MUST NOT contain a drive or + device letter, or a leading slash. All slashes + MUST be forward slashes '/' as opposed to + backwards slashes '\' for compatibility with Amiga + and UNIX file systems etc. If input came from standard + input, there is no file name field. + + ### 4.4.17.2 +If using the Central Directory Encryption Feature and + general purpose bit flag 13 is set indicating masking, the file + name stored in the Local Header will not be the actual file name. + A masking value consisting of a unique hexadecimal value will + be stored. This value will be sequentially incremented for each + file in the archive. See the section on the Strong Encryption + Specification for details on retrieving the encrypted file name. + Refer to the section in this document entitled "Incorporating PKWARE + Proprietary Technology into Your Product" for more information. + + + ### 4.4.18 + file comment: (Variable) + + The comment for this file. + + ### 4.4.19 + number of this disk: (2 bytes) + + The number of this disk, which contains central + directory end record. 
If an archive is in ZIP64 format + and the value in this field is 0xFFFF, the size will + be in the corresponding 4 byte zip64 end of central + directory field. + + + ### 4.4.20 + number of the disk with the start of the central + directory: (2 bytes) + + The number of the disk on which the central + directory starts. If an archive is in ZIP64 format + and the value in this field is 0xFFFF, the size will + be in the corresponding 4 byte zip64 end of central + directory field. + + ### 4.4.21 + total number of entries in the central dir on + this disk: (2 bytes) + + The number of central directory entries on this disk. + If an archive is in ZIP64 format and the value in + this field is 0xFFFF, the size will be in the + corresponding 8 byte zip64 end of central + directory field. + + ### 4.4.22 + total number of entries in the central dir: (2 bytes) + + The total number of files in the .ZIP file. If an + archive is in ZIP64 format and the value in this field + is 0xFFFF, the size will be in the corresponding 8 byte + zip64 end of central directory field. + + ### 4.4.23 + size of the central directory: (4 bytes) + + The size (in bytes) of the entire central directory. + If an archive is in ZIP64 format and the value in + this field is 0xFFFFFFFF, the size will be in the + corresponding 8 byte zip64 end of central + directory field. + + ### 4.4.24 + offset of start of central directory with respect to + the starting disk number: (4 bytes) + + Offset of the start of the central directory on the + disk on which the central directory starts. If an + archive is in ZIP64 format and the value in this + field is 0xFFFFFFFF, the size will be in the + corresponding 8 byte zip64 end of central + directory field. + + ### 4.4.25 + .ZIP file comment length: (2 bytes) + + The length of the comment for this .ZIP file. + + ### 4.4.26 + .ZIP file comment: (Variable) + + The comment for this .ZIP file. ZIP file comment data + is stored unsecured. No encryption or data authentication + is applied to this area at this time. Confidential information + SHOULD NOT be stored in this section. + + ### 4.4.27 + zip64 extensible data sector (variable size) + + (currently reserved for use by PKWARE) + + + ### 4.4.28 + extra field: (Variable) + + This SHOULD be used for storage expansion. If additional + information needs to be stored within a ZIP file for special + application or platform needs, it SHOULD be stored here. + Programs supporting earlier versions of this specification can + then safely skip the file, and find the next file or header. + This field will be 0 length in version 1.0. + + Existing extra fields are defined in the section + Extensible data fields that follows. + +## 4.5 Extensible data fields + + ### 4.5.1 + In order to allow different programs and different types + of information to be stored in the 'extra' field in .ZIP + files, the following structure MUST be used for all + programs storing data in this field: + + header1+data1 + header2+data2 . . . + + Each header MUST consist of: + + Header ID - 2 bytes + Data Size - 2 bytes + + Note: all fields stored in Intel low-byte/high-byte order. + + The Header ID field indicates the type of data that is in + the following data block. + + Header IDs of 0 thru 31 are reserved for use by PKWARE. + The remaining IDs can be used by third party vendors for + proprietary usage. 
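That header-plus-size framing makes the "extra" field self-describing; a hedged sketch (names are mine, and a real reader would also apply the signature check recommended later in 4.6.3) of walking it while skipping unknown IDs:

```rust
// Illustrative: visit each extra-field block as (Header ID, data).
// Both the ID and the Data Size are 2-byte little-endian values (4.5.1).
fn for_each_extra_field(mut extra: &[u8], mut f: impl FnMut(u16, &[u8])) {
    while extra.len() >= 4 {
        let id = u16::from_le_bytes([extra[0], extra[1]]);
        let size = u16::from_le_bytes([extra[2], extra[3]]) as usize;
        if extra.len() < 4 + size {
            break; // truncated block: stop rather than over-read
        }
        f(id, &extra[4..4 + size]);
        extra = &extra[4 + size..];
    }
}

fn main() {
    // One zip64 extended information block (ID 0x0001), 8 bytes of data.
    let mut extra = vec![0x01, 0x00, 0x08, 0x00];
    extra.extend_from_slice(&123_456u64.to_le_bytes());
    for_each_extra_field(&extra, |id, data| {
        println!("header id 0x{id:04x}, {} data bytes", data.len());
    });
}
```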
+ + ### 4.5.2 + The current Header ID mappings defined by PKWARE are: + + 0x0001 Zip64 extended information extra field + 0x0007 AV Info + 0x0008 Reserved for extended language encoding data (PFS) + (see APPENDIX D) + 0x0009 OS/2 + 0x000a NTFS + 0x000c OpenVMS + 0x000d UNIX + 0x000e Reserved for file stream and fork descriptors + 0x000f Patch Descriptor + 0x0014 PKCS#7 Store for X.509 Certificates + 0x0015 X.509 Certificate ID and Signature for + individual file + 0x0016 X.509 Certificate ID for Central Directory + 0x0017 Strong Encryption Header + 0x0018 Record Management Controls + 0x0019 PKCS#7 Encryption Recipient Certificate List + 0x0020 Reserved for Timestamp record + 0x0021 Policy Decryption Key Record + 0x0022 Smartcrypt Key Provider Record + 0x0023 Smartcrypt Policy Key Data Record + 0x0065 IBM S/390 (Z390), AS/400 (I400) attributes + - uncompressed + 0x0066 Reserved for IBM S/390 (Z390), AS/400 (I400) + attributes - compressed + 0x4690 POSZIP 4690 (reserved) + + + ### 4.5.3 + -Zip64 Extended Information Extra Field (0x0001): + + The following is the layout of the zip64 extended + information "extra" block. If one of the size or + offset fields in the Local or Central directory + record is too small to hold the required data, + a Zip64 extended information record is created. + The order of the fields in the zip64 extended + information record is fixed, but the fields MUST + only appear if the corresponding Local or Central + directory record field is set to 0xFFFF or 0xFFFFFFFF. + + Note: all fields stored in Intel low-byte/high-byte order. + + Value Size Description + ----- ---- ----------- +(ZIP64) 0x0001 2 bytes Tag for this "extra" block type + Size 2 bytes Size of this "extra" block + Original + Size 8 bytes Original uncompressed file size + Compressed + Size 8 bytes Size of compressed data + Relative Header + Offset 8 bytes Offset of local header record + Disk Start + Number 4 bytes Number of the disk on which + this file starts + + This entry in the Local header MUST include BOTH original + and compressed file size fields. If encrypting the + central directory and bit 13 of the general purpose bit + flag is set indicating masking, the value stored in the + Local Header for the original file size will be zero. + + + ### 4.5.4 + -OS/2 Extra Field (0x0009): + + The following is the layout of the OS/2 attributes "extra" + block. (Last Revision 09/05/95) + + Note: all fields stored in Intel low-byte/high-byte order. + + Value Size Description + ----- ---- ----------- +(OS/2) 0x0009 2 bytes Tag for this "extra" block type + TSize 2 bytes Size for the following data block + BSize 4 bytes Uncompressed Block Size + CType 2 bytes Compression type + EACRC 4 bytes CRC value for uncompress block + (var) variable Compressed block + + The OS/2 extended attribute structure (FEA2LIST) is + compressed and then stored in its entirety within this + structure. There will only ever be one "block" of data in + VarFields[]. + + ### 4.5.5 + -NTFS Extra Field (0x000a): + + The following is the layout of the NTFS attributes + "extra" block. (Note: At this time the Mtime, Atime + and Ctime values MAY be used on any WIN32 system.) + + Note: all fields stored in Intel low-byte/high-byte order. 
+ + Value Size Description + ----- ---- ----------- +(NTFS) 0x000a 2 bytes Tag for this "extra" block type + TSize 2 bytes Size of the total "extra" block + Reserved 4 bytes Reserved for future use + Tag1 2 bytes NTFS attribute tag value #1 + Size1 2 bytes Size of attribute #1, in bytes + (var) Size1 Attribute #1 data + . + . + . + TagN 2 bytes NTFS attribute tag value #N + SizeN 2 bytes Size of attribute #N, in bytes + (var) SizeN Attribute #N data + + For NTFS, values for Tag1 through TagN are as follows: + (currently only one set of attributes is defined for NTFS) + + Tag Size Description + ----- ---- ----------- + 0x0001 2 bytes Tag for attribute #1 + Size1 2 bytes Size of attribute #1, in bytes + Mtime 8 bytes File last modification time + Atime 8 bytes File last access time + Ctime 8 bytes File creation time + + ### 4.5.6 + -OpenVMS Extra Field (0x000c): + + The following is the layout of the OpenVMS attributes + "extra" block. + + Note: all fields stored in Intel low-byte/high-byte order. + + Value Size Description + ----- ---- ----------- + (VMS) 0x000c 2 bytes Tag for this "extra" block type + TSize 2 bytes Size of the total "extra" block + CRC 4 bytes 32-bit CRC for remainder of the block + Tag1 2 bytes OpenVMS attribute tag value #1 + Size1 2 bytes Size of attribute #1, in bytes + (var) Size1 Attribute #1 data + . + . + . + TagN 2 bytes OpenVMS attribute tag value #N + SizeN 2 bytes Size of attribute #N, in bytes + (var) SizeN Attribute #N data + + OpenVMS Extra Field Rules: + +### 4.5.6.1. +There will be one or more attributes present, which + will each be preceded by the above TagX & SizeX values. + These values are identical to the ATR$C_XXXX and ATR$S_XXXX + constants which are defined in ATR.H under OpenVMS C. Neither + of these values will ever be zero. + +### 4.5.6.2. +No word alignment or padding is performed. + +### 4.5.6.3. +A well-behaved PKZIP/OpenVMS program SHOULD NOT produce + more than one sub-block with the same TagX value. Also, there MUST + NOT be more than one "extra" block of type 0x000c in a particular + directory record. + + ### 4.5.7 + -UNIX Extra Field (0x000d): + + The following is the layout of the UNIX "extra" block. + Note: all fields are stored in Intel low-byte/high-byte + order. + + Value Size Description + ----- ---- ----------- +(UNIX) 0x000d 2 bytes Tag for this "extra" block type + TSize 2 bytes Size for the following data block + Atime 4 bytes File last access time + Mtime 4 bytes File last modification time + Uid 2 bytes File user ID + Gid 2 bytes File group ID + (var) variable Variable length data field + + The variable length data field will contain file type + specific data. Currently the only values allowed are + the original "linked to" file names for hard or symbolic + links, and the major and minor device node numbers for + character and block device nodes. Since device nodes + cannot be either symbolic or hard links, only one set of + variable length data is stored. Link files will have the + name of the original file stored. This name is NOT NULL + terminated. Its size can be determined by checking TSize - + 12. Device entries will have eight bytes stored as two 4 + byte entries (in little endian format). The first entry + will be the major device number, and the second the minor + device number. + + ### 4.5.8 + -PATCH Descriptor Extra Field (0x000f): + + ### 4.5.8.1 +The following is the layout of the Patch Descriptor + "extra" block. + + Note: all fields stored in Intel low-byte/high-byte order. 
+ + Value Size Description + ----- ---- ----------- +(Patch) 0x000f 2 bytes Tag for this "extra" block type + TSize 2 bytes Size of the total "extra" block + Version 2 bytes Version of the descriptor + Flags 4 bytes Actions and reactions (see below) + OldSize 4 bytes Size of the file about to be patched + OldCRC 4 bytes 32-bit CRC of the file to be patched + NewSize 4 bytes Size of the resulting file + NewCRC 4 bytes 32-bit CRC of the resulting file + + ### 4.5.8.2 +Actions and reactions + + Bits Description + ---- ---------------- + 0 Use for auto detection + 1 Treat as a self-patch + 2-3 RESERVED + 4-5 Action (see below) + 6-7 RESERVED + 8-9 Reaction (see below) to absent file + 10-11 Reaction (see below) to newer file + 12-13 Reaction (see below) to unknown file + 14-15 RESERVED + 16-31 RESERVED + + ### 4.5.8.2.1 + Actions + + Action Value + ------ ----- + none 0 + add 1 + delete 2 + patch 3 + + ### 4.5.8.2.2 + Reactions + + Reaction Value + -------- ----- + ask 0 + skip 1 + ignore 2 + fail 3 + + ### 4.5.8.3 +Patch support is provided by PKPatchMaker(tm) technology + and is covered under U.S. Patents and Patents Pending. The use or + implementation in a product of certain technological aspects set + forth in the current APPNOTE, including those with regard to + strong encryption or patching requires a license from PKWARE. + Refer to the section in this document entitled "Incorporating + PKWARE Proprietary Technology into Your Product" for more + information. + + ### 4.5.9 + -PKCS#7 Store for X.509 Certificates (0x0014): + + This field MUST contain information about each of the certificates + files MAY be signed with. When the Central Directory Encryption + feature is enabled for a ZIP file, this record will appear in + the Archive Extra Data Record, otherwise it will appear in the + first central directory record and will be ignored in any + other record. + + + Note: all fields stored in Intel low-byte/high-byte order. + + Value Size Description + ----- ---- ----------- +(Store) 0x0014 2 bytes Tag for this "extra" block type + TSize 2 bytes Size of the store data + TData TSize Data about the store + + + ### 4.5.10 + -X.509 Certificate ID and Signature for individual file (0x0015): + + This field contains the information about which certificate in + the PKCS#7 store was used to sign a particular file. It also + contains the signature data. This field can appear multiple + times, but can only appear once per certificate. + + Note: all fields stored in Intel low-byte/high-byte order. + + Value Size Description + ----- ---- ----------- +(CID) 0x0015 2 bytes Tag for this "extra" block type + TSize 2 bytes Size of data that follows + TData TSize Signature Data + + ### 4.5.11 + -X.509 Certificate ID and Signature for central directory (0x0016): + + This field contains the information about which certificate in + the PKCS#7 store was used to sign the central directory structure. + When the Central Directory Encryption feature is enabled for a + ZIP file, this record will appear in the Archive Extra Data Record, + otherwise it will appear in the first central directory record. + + Note: all fields stored in Intel low-byte/high-byte order. 
+ + Value Size Description + ----- ---- ----------- +(CDID) 0x0016 2 bytes Tag for this "extra" block type + TSize 2 bytes Size of data that follows + TData TSize Data + + ### 4.5.12 + -Strong Encryption Header (0x0017): + + Value Size Description + ----- ---- ----------- + 0x0017 2 bytes Tag for this "extra" block type + TSize 2 bytes Size of data that follows + Format 2 bytes Format definition for this record + AlgID 2 bytes Encryption algorithm identifier + Bitlen 2 bytes Bit length of encryption key + Flags 2 bytes Processing flags + CertData TSize-8 Certificate decryption extra field data + (refer to the explanation for CertData + in the section describing the + Certificate Processing Method under + the Strong Encryption Specification) + + See the section describing the Strong Encryption Specification + for details. Refer to the section in this document entitled + "Incorporating PKWARE Proprietary Technology into Your Product" + for more information. + + ### 4.5.13 + -Record Management Controls (0x0018): + + Value Size Description + ----- ---- ----------- +(Rec-CTL) 0x0018 2 bytes Tag for this "extra" block type + CSize 2 bytes Size of total extra block data + Tag1 2 bytes Record control attribute 1 + Size1 2 bytes Size of attribute 1, in bytes + Data1 Size1 Attribute 1 data + . + . + . + TagN 2 bytes Record control attribute N + SizeN 2 bytes Size of attribute N, in bytes + DataN SizeN Attribute N data + + + ### 4.5.14 + -PKCS#7 Encryption Recipient Certificate List (0x0019): + + This field MAY contain information about each of the certificates + used in encryption processing and it can be used to identify who is + allowed to decrypt encrypted files. This field SHOULD only appear + in the archive extra data record. This field is not required and + serves only to aid archive modifications by preserving public + encryption key data. Individual security requirements may dictate + that this data be omitted to deter information exposure. + + Note: all fields stored in Intel low-byte/high-byte order. + + Value Size Description + ----- ---- ----------- +(CStore) 0x0019 2 bytes Tag for this "extra" block type + TSize 2 bytes Size of the store data + TData TSize Data about the store + + TData: + + Value Size Description + ----- ---- ----------- + Version 2 bytes Format version number - MUST be 0x0001 at this time + CStore (var) PKCS#7 data blob + + See the section describing the Strong Encryption Specification + for details. Refer to the section in this document entitled + "Incorporating PKWARE Proprietary Technology into Your Product" + for more information. + + ### 4.5.15 + -MVS Extra Field (0x0065): + + The following is the layout of the MVS "extra" block. + Note: Some fields are stored in Big Endian format. + All text is in EBCDIC format unless otherwise specified. +Value Size Description + ----- ---- ----------- +(MVS) 0x0065 2 bytes Tag for this "extra" block type + TSize 2 bytes Size for the following data block + ID 4 bytes EBCDIC "Z390" 0xE9F3F9F0 or + "T4MV" for TargetFour + (var) TSize-4 Attribute data (see APPENDIX B) + + + ### 4.5.16 + -OS/400 Extra Field (0x0065): + + The following is the layout of the OS/400 "extra" block. + Note: Some fields are stored in Big Endian format. + All text is in EBCDIC format unless otherwise specified. 
+
+        Value         Size        Description
+        -----         ----        -----------
+(OS400) 0x0065        2 bytes     Tag for this "extra" block type
+        TSize         2 bytes     Size for the following data block
+        ID            4 bytes     EBCDIC "I400" 0xC9F4F0F0 or
+                                  "T4MV" for TargetFour
+                      (var)       TSize-4 Attribute data (see APPENDIX A)
+
+   ### 4.5.17
+   -Policy Decryption Key Record Extra Field (0x0021):
+
+   The following is the layout of the Policy Decryption Key "extra" block.
+   TData is a variable length, variable content field. It holds
+   information about encryptions and/or encryption key sources.
+   Contact PKWARE for information on current TData structures.
+   Information in this "extra" block may alternatively be placed
+   within comment fields. Refer to the section in this document
+   entitled "Incorporating PKWARE Proprietary Technology into Your
+   Product" for more information.
+
+        Value         Size        Description
+        -----         ----        -----------
+        0x0021        2 bytes     Tag for this "extra" block type
+        TSize         2 bytes     Size for the following data block
+        TData         TSize       Data about the key
+
+   ### 4.5.18
+   -Key Provider Record Extra Field (0x0022):
+
+   The following is the layout of the Key Provider "extra" block.
+   TData is a variable length, variable content field. It holds
+   information about encryptions and/or encryption key sources.
+   Contact PKWARE for information on current TData structures.
+   Information in this "extra" block may alternatively be placed
+   within comment fields. Refer to the section in this document
+   entitled "Incorporating PKWARE Proprietary Technology into Your
+   Product" for more information.
+
+        Value         Size        Description
+        -----         ----        -----------
+        0x0022        2 bytes     Tag for this "extra" block type
+        TSize         2 bytes     Size for the following data block
+        TData         TSize       Data about the key
+
+   ### 4.5.19
+   -Policy Key Data Record Extra Field (0x0023):
+
+   The following is the layout of the Policy Key Data "extra" block.
+   TData is a variable length, variable content field. It holds
+   information about encryptions and/or encryption key sources.
+   Contact PKWARE for information on current TData structures.
+   Information in this "extra" block may alternatively be placed
+   within comment fields. Refer to the section in this document
+   entitled "Incorporating PKWARE Proprietary Technology into Your
+   Product" for more information.
+ + Value Size Description + ----- ---- ----------- + 0x0023 2 bytes Tag for this "extra" block type + TSize 2 bytes Size for the following data block + TData TSize Data about the key + +## 4.6 Third Party Mappings + + ### 4.6.1 + Third party mappings commonly used are: + + 0x07c8 Macintosh + 0x2605 ZipIt Macintosh + 0x2705 ZipIt Macintosh 1.3.5+ + 0x2805 ZipIt Macintosh 1.3.5+ + 0x334d Info-ZIP Macintosh + 0x4341 Acorn/SparkFS + 0x4453 Windows NT security descriptor (binary ACL) + 0x4704 VM/CMS + 0x470f MVS + 0x4b46 FWKCS MD5 (see below) + 0x4c41 OS/2 access control list (text ACL) + 0x4d49 Info-ZIP OpenVMS + 0x4f4c Xceed original location extra field + 0x5356 AOS/VS (ACL) + 0x5455 extended timestamp + 0x554e Xceed unicode extra field + 0x5855 Info-ZIP UNIX (original, also OS/2, NT, etc) + 0x6375 Info-ZIP Unicode Comment Extra Field + 0x6542 BeOS/BeBox + 0x7075 Info-ZIP Unicode Path Extra Field + 0x756e ASi UNIX + 0x7855 Info-ZIP UNIX (new) + 0xa11e Data Stream Alignment (Apache Commons-Compress) + 0xa220 Microsoft Open Packaging Growth Hint + 0xfd4a SMS/QDOS + 0x9901 AE-x encryption structure (see APPENDIX E) + 0x9902 unknown + + + Detailed descriptions of Extra Fields defined by third + party mappings will be documented as information on + these data structures is made available to PKWARE. + PKWARE does not guarantee the accuracy of any published + third party data. + + ### 4.6.2 + Third-party Extra Fields MUST include a Header ID using + the format defined in the section of this document + titled Extensible Data Fields (section 4.5). + + The Data Size field indicates the size of the following + data block. Programs can use this value to skip to the + next header block, passing over any data blocks that are + not of interest. + + Note: As stated above, the size of the entire .ZIP file + header, including the file name, comment, and extra + field SHOULD NOT exceed 64K in size. + + ### 4.6.3 + In case two different programs appropriate the same + Header ID value, it is strongly recommended that each + program SHOULD place a unique signature of at least two bytes in + size (and preferably 4 bytes or bigger) at the start of + each data area. Every program SHOULD verify that its + unique signature is present, in addition to the Header ID + value being correct, before assuming that it is a block of + known type. + + Third-party Mappings: + + ### 4.6.4 + -ZipIt Macintosh Extra Field (long) (0x2605): + + The following is the layout of the ZipIt extra block + for Macintosh. The local-header and central-header versions + are identical. This block MUST be present if the file is + stored MacBinary-encoded and it SHOULD NOT be used if the file + is not stored MacBinary-encoded. + + Value Size Description + ----- ---- ----------- + (Mac2) 0x2605 Short tag for this extra block type + TSize Short total data size for this block + "ZPIT" beLong extra-field signature + FnLen Byte length of FileName + FileName variable full Macintosh filename + FileType Byte[4] four-byte Mac file type string + Creator Byte[4] four-byte Mac creator string + + + ### 4.6.5 + -ZipIt Macintosh Extra Field (short, for files) (0x2705): + + The following is the layout of a shortened variant of the + ZipIt extra block for Macintosh (without "full name" entry). + This variant is used by ZipIt 1.3.5 and newer for entries of + files (not directories) that do not have a MacBinary encoded + file. The local-header and central-header versions are identical. 
+ + Value Size Description + ----- ---- ----------- + (Mac2b) 0x2705 Short tag for this extra block type + TSize Short total data size for this block (12) + "ZPIT" beLong extra-field signature + FileType Byte[4] four-byte Mac file type string + Creator Byte[4] four-byte Mac creator string + fdFlags beShort attributes from FInfo.frFlags, + MAY be omitted + 0x0000 beShort reserved, MAY be omitted + + + ### 4.6.6 + -ZipIt Macintosh Extra Field (short, for directories) (0x2805): + + The following is the layout of a shortened variant of the + ZipIt extra block for Macintosh used only for directory + entries. This variant is used by ZipIt 1.3.5 and newer to + save some optional Mac-specific information about directories. + The local-header and central-header versions are identical. + + Value Size Description + ----- ---- ----------- + (Mac2c) 0x2805 Short tag for this extra block type + TSize Short total data size for this block (12) + "ZPIT" beLong extra-field signature + frFlags beShort attributes from DInfo.frFlags, MAY + be omitted + View beShort ZipIt view flag, MAY be omitted + + + The View field specifies ZipIt-internal settings as follows: + + Bits of the Flags: + bit 0 if set, the folder is shown expanded (open) + when the archive contents are viewed in ZipIt. + bits 1-15 reserved, zero; + + + ### 4.6.7 + -FWKCS MD5 Extra Field (0x4b46): + + The FWKCS Contents_Signature System, used in + automatically identifying files independent of file name, + optionally adds and uses an extra field to support the + rapid creation of an enhanced contents_signature: + + Header ID = 0x4b46 + Data Size = 0x0013 + Preface = 'M','D','5' + followed by 16 bytes containing the uncompressed file's + 128_bit MD5 hash(1), low byte first. + + When FWKCS revises a .ZIP file central directory to add + this extra field for a file, it also replaces the + central directory entry for that file's uncompressed + file length with a measured value. + + FWKCS provides an option to strip this extra field, if + present, from a .ZIP file central directory. In adding + this extra field, FWKCS preserves .ZIP file Authenticity + Verification; if stripping this extra field, FWKCS + preserves all versions of AV through PKZIP version 2.04g. + + FWKCS, and FWKCS Contents_Signature System, are + trademarks of Frederick W. Kantor. + + (1) R. Rivest, RFC1321.TXT, MIT Laboratory for Computer + Science and RSA Data Security, Inc., April 1992. + ll.76-77: "The MD5 algorithm is being placed in the + public domain for review and possible adoption as a + standard." + + + ### 4.6.8 + -Info-ZIP Unicode Comment Extra Field (0x6375): + + Stores the UTF-8 version of the file comment as stored in the + central directory header. (Last Revision 20070912) + + Value Size Description + ----- ---- ----------- + (UCom) 0x6375 Short tag for this extra block type ("uc") + TSize Short total data size for this block + Version 1 byte version of this extra field, currently 1 + ComCRC32 4 bytes Comment Field CRC32 Checksum + UnicodeCom Variable UTF-8 version of the entry comment + + Currently Version is set to the number 1. If there is a need + to change this field, the version will be incremented. Changes + MAY NOT be backward compatible so this extra field SHOULD NOT be + used if the version is not recognized. + + The ComCRC32 is the standard zip CRC32 checksum of the File Comment + field in the central directory header. This is used to verify that + the comment field has not changed since the Unicode Comment extra field + was created. 
This can happen if a utility changes the File Comment + field but does not update the UTF-8 Comment extra field. If the CRC + check fails, this Unicode Comment extra field SHOULD be ignored and + the File Comment field in the header SHOULD be used instead. + + The UnicodeCom field is the UTF-8 version of the File Comment field + in the header. As UnicodeCom is defined to be UTF-8, no UTF-8 byte + order mark (BOM) is used. The length of this field is determined by + subtracting the size of the previous fields from TSize. If both the + File Name and Comment fields are UTF-8, the new General Purpose Bit + Flag, bit 11 (Language encoding flag (EFS)), can be used to indicate + both the header File Name and Comment fields are UTF-8 and, in this + case, the Unicode Path and Unicode Comment extra fields are not + needed and SHOULD NOT be created. Note that, for backward + compatibility, bit 11 SHOULD only be used if the native character set + of the paths and comments being zipped up are already in UTF-8. It is + expected that the same file comment storage method, either general + purpose bit 11 or extra fields, be used in both the Local and Central + Directory Header for a file. + + + ### 4.6.9 + -Info-ZIP Unicode Path Extra Field (0x7075): + + Stores the UTF-8 version of the file name field as stored in the + local header and central directory header. (Last Revision 20070912) + + Value Size Description + ----- ---- ----------- + (UPath) 0x7075 Short tag for this extra block type ("up") + TSize Short total data size for this block + Version 1 byte version of this extra field, currently 1 + NameCRC32 4 bytes File Name Field CRC32 Checksum + UnicodeName Variable UTF-8 version of the entry File Name + + Currently Version is set to the number 1. If there is a need + to change this field, the version will be incremented. Changes + MAY NOT be backward compatible so this extra field SHOULD NOT be + used if the version is not recognized. + + The NameCRC32 is the standard zip CRC32 checksum of the File Name + field in the header. This is used to verify that the header + File Name field has not changed since the Unicode Path extra field + was created. This can happen if a utility renames the File Name but + does not update the UTF-8 path extra field. If the CRC check fails, + this UTF-8 Path Extra Field SHOULD be ignored and the File Name field + in the header SHOULD be used instead. + + The UnicodeName is the UTF-8 version of the contents of the File Name + field in the header. As UnicodeName is defined to be UTF-8, no UTF-8 + byte order mark (BOM) is used. The length of this field is determined + by subtracting the size of the previous fields from TSize. If both + the File Name and Comment fields are UTF-8, the new General Purpose + Bit Flag, bit 11 (Language encoding flag (EFS)), can be used to + indicate that both the header File Name and Comment fields are UTF-8 + and, in this case, the Unicode Path and Unicode Comment extra fields + are not needed and SHOULD NOT be created. Note that, for backward + compatibility, bit 11 SHOULD only be used if the native character set + of the paths and comments being zipped up are already in UTF-8. It is + expected that the same file name storage method, either general + purpose bit 11 or extra fields, be used in both the Local and Central + Directory Header for a file. 
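+
+   Implementor's note (not part of the PKWARE text): the CRC check in
+   4.6.8 and 4.6.9 translates directly into code. A minimal Rust sketch
+   of the 4.6.9 validation, assuming the crc32fast crate for the CRC-32
+   computation; the function name is illustrative only:
+
+      /// Returns the UTF-8 name from a 0x7075 field's data (the bytes
+      /// after tag and TSize), or None if the field is stale or
+      /// malformed, in which case the caller falls back to the header
+      /// File Name field as 4.6.9 requires.
+      fn unicode_path<'a>(tdata: &'a [u8], header_name: &[u8]) -> Option<&'a str> {
+          if tdata.len() < 5 || tdata[0] != 1 {
+              return None; // too short, or an unrecognized Version
+          }
+          let stored = u32::from_le_bytes(tdata[1..5].try_into().unwrap());
+          if crc32fast::hash(header_name) != stored {
+              return None; // NameCRC32 mismatch: the extra field is stale
+          }
+          std::str::from_utf8(&tdata[5..]).ok() // UnicodeName, no BOM
+      }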
+ + + ### 4.6.10 + -Microsoft Open Packaging Growth Hint (0xa220): + + Value Size Description + ----- ---- ----------- + 0xa220 Short tag for this extra block type + TSize Short size of Sig + PadVal + Padding + Sig Short verification signature (A028) + PadVal Short Initial padding value + Padding variable filled with NULL characters + + ### 4.6.11 + -Data Stream Alignment (Apache Commons-Compress) (0xa11e): + + (per Zbynek Vyskovsky) Defines alignment of data stream of this + entry within the zip archive. Additionally, indicates whether the + compression method should be kept when re-compressing the zip file. + + The purpose of this extra field is to align specific resources to + word or page boundaries so they can be easily mapped into memory. + + Value Size Description + ----- ---- ----------- + 0xa11e Short tag for this extra block type + TSize Short total data size for this block (2+padding) + alignment Short required alignment and indicator + 0x00 Variable padding + + The alignment field (lower 15 bits) defines the minimal alignment + required by the data stream. Bit 15 of alignment field indicates + whether the compression method of this entry can be changed when + recompressing the zip file. The value 0 means the compression method + should not be changed. The value 1 indicates the compression method + may be changed. The padding field contains padding to ensure the correct + alignment. It can be changed at any time when the offset or required + alignment changes. (see https://issues.apache.org/jira/browse/COMPRESS-391) + + +## 4.7 Manifest Files + +### 4.7.1 + Applications using ZIP files MAY have a need for additional + information that MUST be included with the files placed into + a ZIP file. Application specific information that cannot be + stored using the defined ZIP storage records SHOULD be stored + using the extensible Extra Field convention defined in this + document. However, some applications MAY use a manifest + file as a means for storing additional information. One + example is the META-INF/MANIFEST.MF file used in ZIP formatted + files having the .JAR extension (JAR files). + +### 4.7.2 + A manifest file is a file created for the application process + that requires this information. A manifest file MAY be of any + file type required by the defining application process. It is + placed within the same ZIP file as files to which this information + applies. By convention, this file is typically the first file placed + into the ZIP file and it MAY include a defined directory path. + +### 4.7.3 + Manifest files MAY be compressed or encrypted as needed for + application processing of the files inside the ZIP files. + + Manifest files are outside of the scope of this specification. + + +## 5.0 Explanation of compression methods + + +## 5.1 UnShrinking - Method 1 + +### 5.1.1 + Shrinking is a Dynamic Ziv-Lempel-Welch compression algorithm + with partial clearing. The initial code size is 9 bits, and the + maximum code size is 13 bits. Shrinking differs from conventional + Dynamic Ziv-Lempel-Welch implementations in several respects: + +### 5.1.2 + The code size is controlled by the compressor, and is + not automatically increased when codes larger than the current + code size are created (but not necessarily used). When + the decompressor encounters the code sequence 256 + (decimal) followed by 1, it SHOULD increase the code size + read from the input stream to the next bit size. 
No + blocking of the codes is performed, so the next code at + the increased size SHOULD be read from the input stream + immediately after where the previous code at the smaller + bit size was read. Again, the decompressor SHOULD NOT + increase the code size used until the sequence 256,1 is + encountered. + +### 5.1.3 + When the table becomes full, total clearing is not + performed. Rather, when the compressor emits the code + sequence 256,2 (decimal), the decompressor SHOULD clear + all leaf nodes from the Ziv-Lempel tree, and continue to + use the current code size. The nodes that are cleared + from the Ziv-Lempel tree are then re-used, with the lowest + code value re-used first, and the highest code value + re-used last. The compressor can emit the sequence 256,2 + at any time. + +## 5.2 Expanding - Methods 2-5 + +### 5.2.1 + The Reducing algorithm is actually a combination of two + distinct algorithms. The first algorithm compresses repeated + byte sequences, and the second algorithm takes the compressed + stream from the first algorithm and applies a probabilistic + compression method. + +### 5.2.2 + The probabilistic compression stores an array of 'follower + sets' S(j), for j=0 to 255, corresponding to each possible + ASCII character. Each set contains between 0 and 32 + characters, to be denoted as S(j)[0],...,S(j)[m], where m<32. + The sets are stored at the beginning of the data area for a + Reduced file, in reverse order, with S(255) first, and S(0) + last. + +### 5.2.3 + The sets are encoded as { N(j), S(j)[0],...,S(j)[N(j)-1] }, + where N(j) is the size of set S(j). N(j) can be 0, in which + case the follower set for S(j) is empty. Each N(j) value is + encoded in 6 bits, followed by N(j) eight bit character values + corresponding to S(j)[0] to S(j)[N(j)-1] respectively. If + N(j) is 0, then no values for S(j) are stored, and the value + for N(j-1) immediately follows. + +### 5.2.4 + Immediately after the follower sets, is the compressed data + stream. The compressed data stream can be interpreted for the + probabilistic decompression as follows: + + let Last-Character <- 0. + loop until done + if the follower set S(Last-Character) is empty then + read 8 bits from the input stream, and copy this + value to the output stream. + otherwise if the follower set S(Last-Character) is non-empty then + read 1 bit from the input stream. + if this bit is not zero then + read 8 bits from the input stream, and copy this + value to the output stream. + otherwise if this bit is zero then + read B(N(Last-Character)) bits from the input + stream, and assign this value to I. + Copy the value of S(Last-Character)[I] to the + output stream. + + assign the last value placed on the output stream to + Last-Character. + end loop + + B(N(j)) is defined as the minimal number of bits required to + encode the value N(j)-1. + +### 5.2.5 + The decompressed stream from above can then be expanded to + re-create the original file as follows: + + let State <- 0. + + loop until done + read 8 bits from the input stream into C. + case State of + 0: if C is not equal to DLE (144 decimal) then + copy C to the output stream. + otherwise if C is equal to DLE then + let State <- 1. + + 1: if C is non-zero then + let V <- C. + let Len <- L(V) + let State <- F(Len). + otherwise if C is zero then + copy the value 144 (decimal) to the output stream. + let State <- 0 + + 2: let Len <- Len + C + let State <- 3. 
+ + 3: move backwards D(V,C) bytes in the output stream + (if this position is before the start of the output + stream, then assume that all the data before the + start of the output stream is filled with zeros). + copy Len+3 bytes from this position to the output stream. + let State <- 0. + end case + end loop + + The functions F,L, and D are dependent on the 'compression + factor', 1 through 4, and are defined as follows: + + For compression factor 1: + L(X) equals the lower 7 bits of X. + F(X) equals 2 if X equals 127 otherwise F(X) equals 3. + D(X,Y) equals the (upper 1 bit of X) * 256 + Y + 1. + For compression factor 2: + L(X) equals the lower 6 bits of X. + F(X) equals 2 if X equals 63 otherwise F(X) equals 3. + D(X,Y) equals the (upper 2 bits of X) * 256 + Y + 1. + For compression factor 3: + L(X) equals the lower 5 bits of X. + F(X) equals 2 if X equals 31 otherwise F(X) equals 3. + D(X,Y) equals the (upper 3 bits of X) * 256 + Y + 1. + For compression factor 4: + L(X) equals the lower 4 bits of X. + F(X) equals 2 if X equals 15 otherwise F(X) equals 3. + D(X,Y) equals the (upper 4 bits of X) * 256 + Y + 1. + +## 5.3 Imploding - Method 6 + +### 5.3.1 + The Imploding algorithm is actually a combination of two + distinct algorithms. The first algorithm compresses repeated byte + sequences using a sliding dictionary. The second algorithm is + used to compress the encoding of the sliding dictionary output, + using multiple Shannon-Fano trees. + +### 5.3.2 + The Imploding algorithm can use a 4K or 8K sliding dictionary + size. The dictionary size used can be determined by bit 1 in the + general purpose flag word; a 0 bit indicates a 4K dictionary + while a 1 bit indicates an 8K dictionary. + +### 5.3.3 + The Shannon-Fano trees are stored at the start of the + compressed file. The number of trees stored is defined by bit 2 in + the general purpose flag word; a 0 bit indicates two trees stored, + a 1 bit indicates three trees are stored. If 3 trees are stored, + the first Shannon-Fano tree represents the encoding of the + Literal characters, the second tree represents the encoding of + the Length information, the third represents the encoding of the + Distance information. When 2 Shannon-Fano trees are stored, the + Length tree is stored first, followed by the Distance tree. + +### 5.3.4 + The Literal Shannon-Fano tree, if present is used to represent + the entire ASCII character set, and contains 256 values. This + tree is used to compress any data not compressed by the sliding + dictionary algorithm. When this tree is present, the Minimum + Match Length for the sliding dictionary is 3. If this tree is + not present, the Minimum Match Length is 2. + +### 5.3.5 + The Length Shannon-Fano tree is used to compress the Length + part of the (length,distance) pairs from the sliding dictionary + output. The Length tree contains 64 values, ranging from the + Minimum Match Length, to 63 plus the Minimum Match Length. + +### 5.3.6 + The Distance Shannon-Fano tree is used to compress the Distance + part of the (length,distance) pairs from the sliding dictionary + output. The Distance tree contains 64 values, ranging from 0 to + 63, representing the upper 6 bits of the distance value. The + distance values themselves will be between 0 and the sliding + dictionary size, either 4K or 8K. + +### 5.3.7 + The Shannon-Fano trees themselves are stored in a compressed + format. The first byte of the tree data represents the number of + bytes of data representing the (compressed) Shannon-Fano tree + minus 1. 
The remaining bytes represent the Shannon-Fano tree + data encoded as: + + High 4 bits: Number of values at this bit length + 1. (1 - 16) + Low 4 bits: Bit Length needed to represent value + 1. (1 - 16) + +### 5.3.8 + The Shannon-Fano codes can be constructed from the bit lengths + using the following algorithm: + + 1) Sort the Bit Lengths in ascending order, while retaining the + order of the original lengths stored in the file. + + 2) Generate the Shannon-Fano trees: + + Code <- 0 + CodeIncrement <- 0 + LastBitLength <- 0 + i <- number of Shannon-Fano codes - 1 (either 255 or 63) + + loop while i >= 0 + Code = Code + CodeIncrement + if BitLength(i) <> LastBitLength then + LastBitLength=BitLength(i) + CodeIncrement = 1 shifted left (16 - LastBitLength) + ShannonCode(i) = Code + i <- i - 1 + end loop + + 3) Reverse the order of all the bits in the above ShannonCode() + vector, so that the most significant bit becomes the least + significant bit. For example, the value 0x1234 (hex) would + become 0x2C48 (hex). + + 4) Restore the order of Shannon-Fano codes as originally stored + within the file. + + Example: + + This example will show the encoding of a Shannon-Fano tree + of size 8. Notice that the actual Shannon-Fano trees used + for Imploding are either 64 or 256 entries in size. + + Example: 0x02, 0x42, 0x01, 0x13 + + The first byte indicates 3 values in this table. Decoding the + bytes: + 0x42 = 5 codes of 3 bits long + 0x01 = 1 code of 2 bits long + 0x13 = 2 codes of 4 bits long + + This would generate the original bit length array of: + (3, 3, 3, 3, 3, 2, 4, 4) + + There are 8 codes in this table for the values 0 thru 7. Using + the algorithm to obtain the Shannon-Fano codes produces: + + Reversed Order Original + Val Sorted Constructed Code Value Restored Length + --- ------ ----------------- -------- -------- ------ + 0: 2 1100000000000000 11 101 3 + 1: 3 1010000000000000 101 001 3 + 2: 3 1000000000000000 001 110 3 + 3: 3 0110000000000000 110 010 3 + 4: 3 0100000000000000 010 100 3 + 5: 3 0010000000000000 100 11 2 + 6: 4 0001000000000000 1000 1000 4 + 7: 4 0000000000000000 0000 0000 4 + + The values in the Val, Order Restored and Original Length columns + now represent the Shannon-Fano encoding tree that can be used for + decoding the Shannon-Fano encoded data. How to parse the + variable length Shannon-Fano values from the data stream is beyond + the scope of this document. (See the references listed at the end of + this document for more information.) However, traditional decoding + schemes used for Huffman variable length decoding, such as the + Greenlaw algorithm, can be successfully applied. + +### 5.3.9 + The compressed data stream begins immediately after the + compressed Shannon-Fano data. The compressed data stream can be + interpreted as follows: + + loop until done + read 1 bit from input stream. + + if this bit is non-zero then (encoded data is literal data) + if Literal Shannon-Fano tree is present + read and decode character using Literal Shannon-Fano tree. + otherwise + read 8 bits from input stream. + copy character to the output stream. + otherwise (encoded data is sliding dictionary match) + if 8K dictionary size + read 7 bits for offset Distance (lower 7 bits of offset). + otherwise + read 6 bits for offset Distance (lower 6 bits of offset). + + using the Distance Shannon-Fano tree, read and decode the + upper 6 bits of the Distance value. + + using the Length Shannon-Fano tree, read and decode + the Length value. 
+
+      Length <- Length + Minimum Match Length
+
+      if Length = 63 + Minimum Match Length
+         read 8 bits from the input stream,
+         add this value to Length.
+
+      move backwards Distance+1 bytes in the output stream, and
+      copy Length characters from this position to the output
+      stream. (if this position is before the start of the output
+      stream, then assume that all the data before the start of
+      the output stream is filled with zeros).
+   end loop
+
+## 5.4 Tokenizing - Method 7
+
+### 5.4.1
+   This method is not used by PKZIP.
+
+## 5.5 Deflating - Method 8
+
+### 5.5.1
+   The Deflate algorithm is similar to the Implode algorithm using
+   a sliding dictionary of up to 32K with secondary compression
+   from Huffman/Shannon-Fano codes.
+
+### 5.5.2
+   The compressed data is stored in blocks with a header describing
+   the block and the Huffman codes used in the data block. The header
+   format is as follows:
+
+   Bit 0: Last Block bit. This bit is set to 1 if this is the last
+          compressed block in the data.
+   Bits 1-2: Block type
+      00 (0) - Block is stored - All stored data is byte aligned.
+               Skip bits until next byte, then next word = block
+               length, followed by the one's complement of the block
+               length word. Remaining data in block is the stored
+               data.
+
+      01 (1) - Use fixed Huffman codes for literal and distance codes.
+
+               Lit Code    Bits      Dist Code   Bits
+               ---------   ----      ---------   ----
+                 0 - 143    8          0 - 31     5
+               144 - 255    9
+               256 - 279    7
+               280 - 287    8
+
+               Literal codes 286-287 and distance codes 30-31 are
+               never used but participate in the Huffman construction.
+
+      10 (2) - Dynamic Huffman codes. (See expanding Huffman codes)
+
+      11 (3) - Reserved - Flag an "Error in compressed data" if seen.
+
+### 5.5.3
+   Expanding Huffman Codes
+
+   If the data block is stored with dynamic Huffman codes, the Huffman
+   codes are sent in the following compressed format:
+
+      5 Bits: # of Literal codes sent - 256 (256 - 286)
+              All other codes are never sent.
+      5 Bits: # of Dist codes - 1 (1 - 32)
+      4 Bits: # of Bit Length codes - 3 (3 - 19)
+
+   The Huffman codes are sent as bit lengths and the codes are built as
+   described in the Implode algorithm. The bit lengths themselves are
+   compressed with Huffman codes. There are 19 bit length codes:
+
+      0 - 15: Represent bit lengths of 0 - 15
+          16: Copy the previous bit length 3 - 6 times.
+              The next 2 bits indicate repeat length (0 = 3, ..., 3 = 6)
+              Example: Codes 8, 16 (+2 bits 11), 16 (+2 bits 10) will
+              expand to 12 bit lengths of 8 (1 + 6 + 5)
+          17: Repeat a bit length of 0 for 3 - 10 times. (3 bits of length)
+          18: Repeat a bit length of 0 for 11 - 138 times (7 bits of length)
+
+   The lengths of the bit length codes are sent packed 3 bits per value
+   (0 - 7) in the following order:
+
+      16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
+
+   The Huffman codes SHOULD be built as described in the Implode algorithm
+   except codes are assigned starting at the shortest bit length, i.e. the
+   shortest code SHOULD be all 0's rather than all 1's. Also, codes with
+   a bit length of zero do not participate in the tree construction. The
+   codes are then used to decode the bit lengths for the literal and
+   distance tables.
+
+   The bit lengths for the literal tables are sent first with the number
+   of entries sent described by the 5 bits sent earlier. There are up
+   to 286 literal characters; the first 256 represent the respective 8
+   bit character, code 256 represents the End-Of-Block code, the remaining
+   29 codes represent copy lengths of 3 thru 258.
+   There are up to 30 distance codes representing distances from
+   1 thru 32k as described below.
+
+                            Length Codes
+                            ------------
+        Extra           Extra             Extra             Extra
+   Code Bits Length Code Bits Lengths Code Bits Lengths Code Bits Length(s)
+   ---- ---- ------ ---- ---- ------- ---- ---- ------- ---- ---- ---------
+    257   0    3    265   1   11,12   273   3   35-42   281   5   131-162
+    258   0    4    266   1   13,14   274   3   43-50   282   5   163-194
+    259   0    5    267   1   15,16   275   3   51-58   283   5   195-226
+    260   0    6    268   1   17,18   276   3   59-66   284   5   227-257
+    261   0    7    269   2   19-22   277   4   67-82   285   0   258
+    262   0    8    270   2   23-26   278   4   83-98
+    263   0    9    271   2   27-30   279   4   99-114
+    264   0   10    272   2   31-34   280   4  115-130
+
+                            Distance Codes
+                            --------------
+        Extra           Extra              Extra                Extra
+   Code Bits Dist  Code Bits  Dist   Code Bits Distance   Code Bits Distance
+   ---- ---- ----  ---- ---- ------  ---- ---- ---------  ---- ---- -----------
+     0    0    1     8    3  17-24    16    7  257-384     24   11  4097-6144
+     1    0    2     9    3  25-32    17    7  385-512     25   11  6145-8192
+     2    0    3    10    4  33-48    18    8  513-768     26   12  8193-12288
+     3    0    4    11    4  49-64    19    8  769-1024    27   12  12289-16384
+     4    1   5,6   12    5  65-96    20    9  1025-1536   28   13  16385-24576
+     5    1   7,8   13    5  97-128   21    9  1537-2048   29   13  24577-32768
+     6    2   9-12  14    6  129-192  22   10  2049-3072
+     7    2  13-16  15    6  193-256  23   10  3073-4096
+
+### 5.5.4
+   The compressed data stream begins immediately after the
+   compressed header data. The compressed data stream can be
+   interpreted as follows:
+
+   do
+      read header from input stream.
+
+      if stored block
+         skip bits until byte aligned
+         read count and one's complement of count
+         copy count bytes data block
+      otherwise
+         loop until end of block code sent
+            decode literal character from input stream
+            if literal < 256
+               copy character to the output stream
+            otherwise
+               if literal = end of block
+                  break from loop
+               otherwise
+                  decode distance from input stream
+
+                  move backwards distance bytes in the output stream, and
+                  copy length characters from this position to the output
+                  stream.
+         end loop
+   while not last block
+
+   if data descriptor exists
+      skip bits until byte aligned
+      read crc and sizes
+   endif
+
+## 5.6 Enhanced Deflating - Method 9
+
+### 5.6.1
+   The Enhanced Deflating algorithm is similar to Deflate but uses
+   a sliding dictionary of up to 64K. Deflate64(tm) is supported
+   by the Deflate extractor.
+
+## 5.7 BZIP2 - Method 12
+
+### 5.7.1
+   BZIP2 is an open-source data compression algorithm developed by
+   Julian Seward. Information and source code for this algorithm
+   can be found on the internet.
+
+## 5.8 LZMA - Method 14
+
+### 5.8.1
+   LZMA is a block-oriented, general purpose data compression
+   algorithm developed and maintained by Igor Pavlov. It is a derivative
+   of LZ77 that utilizes Markov chains and a range coder. Information and
+   source code for this algorithm can be found on the internet. Consult
+   with the author of this algorithm for information on terms or
+   restrictions on use.
+
+   Support for LZMA within the ZIP format is defined as follows:
+
+### 5.8.2
+   The Compression method field within the ZIP Local and Central
+   Header records will be set to the value 14 to indicate data was
+   compressed using LZMA.
+
+### 5.8.3
+   The Version needed to extract field within the ZIP Local and
+   Central Header records will be set to 6.3 to indicate the minimum
+   ZIP format version supporting this feature.
+
+### 5.8.4
+   File data compressed using the LZMA algorithm MUST be placed
+   immediately following the Local Header for the file.
+   If a standard ZIP encryption header is required, it will follow
+   the Local Header and will precede the LZMA compressed file data
+   segment. The location of the LZMA compressed data segment within
+   the ZIP format will be as shown:
+
+      [local header file 1]
+      [encryption header file 1]
+      [LZMA compressed data segment for file 1]
+      [data descriptor 1]
+      [local header file 2]
+
+### 5.8.5
+   The encryption header and data descriptor records MAY
+   be conditionally present. The LZMA Compressed Data Segment
+   will consist of an LZMA Properties Header followed by the
+   LZMA Compressed Data as shown:
+
+      [LZMA properties header for file 1]
+      [LZMA compressed data for file 1]
+
+### 5.8.6
+   The LZMA Compressed Data will be stored as provided by the
+   LZMA compression library. Compressed size, uncompressed size and
+   other file characteristics about the file being compressed MUST be
+   stored in standard ZIP storage format.
+
+### 5.8.7
+   The LZMA Properties Header will store specific data required
+   to decompress the LZMA compressed Data. This data is set by the
+   LZMA compression engine using the function WriteCoderProperties()
+   as documented within the LZMA SDK.
+
+### 5.8.8
+   Storage fields for the property information within the LZMA
+   Properties Header are as follows:
+
+      LZMA Version Information  2 bytes
+      LZMA Properties Size      2 bytes
+      LZMA Properties Data      variable, defined by "LZMA Properties Size"
+
+   ### 5.8.8.1
+LZMA Version Information - this field identifies which version
+   of the LZMA SDK was used to compress a file. The first byte will
+   store the major version number of the LZMA SDK and the second
+   byte will store the minor number.
+
+   ### 5.8.8.2
+LZMA Properties Size - this field defines the size of the
+   remaining property data. Typically this size SHOULD be determined by
+   the version of the SDK. This size field is included as a convenience
+   and to help avoid any ambiguity arising in the future due
+   to changes in this compression algorithm.
+
+   ### 5.8.8.3
+LZMA Property Data - this variable sized field records the
+   required values for the decompressor as defined by the LZMA SDK.
+   The data stored in this field SHOULD be obtained using
+   WriteCoderProperties() in the version of the SDK defined by
+   the "LZMA Version Information" field.
+
+   ### 5.8.8.4
+The layout of the "LZMA Properties Data" field is a function of
+   the LZMA compression algorithm. It is possible that this layout MAY be
+   changed by the author over time. The data layout in version 4.3 of the
+   LZMA SDK defines a 5 byte array that uses 4 bytes to store the dictionary
+   size in little-endian order. This is preceded by a single packed byte as
+   the first element of the array that contains the following fields:
+
+      PosStateBits
+      LiteralPosStateBits
+      LiteralContextBits
+
+   Refer to the LZMA documentation for a more detailed explanation of
+   these fields.
+
+### 5.8.9
+   Data compressed with method 14, LZMA, MAY include an end-of-stream
+   (EOS) marker ending the compressed data stream. This marker is not
+   required, but its use is highly recommended to facilitate processing
+   and implementers SHOULD include the EOS marker whenever possible.
+   When the EOS marker is used, general purpose bit 1 MUST be set. If
+   general purpose bit 1 is not set, the EOS marker is not present.
+
+## 5.9 WavPack - Method 97
+
+### 5.9.1
+   Information describing the use of compression method 97 is
+   provided by WinZIP International, LLC.
This method relies on the + open source WavPack audio compression utility developed by David Bryant. + Information on WavPack is available at www.wavpack.com. Please consult + with the author of this algorithm for information on terms and + restrictions on use. + +### 5.9.2 + WavPack data for a file begins immediately after the end of the + local header data. This data is the output from WavPack compression + routines. Within the ZIP file, the use of WavPack compression is + indicated by setting the compression method field to a value of 97 + in both the local header and the central directory header. The Version + needed to extract and version made by fields use the same values as are + used for data compressed using the Deflate algorithm. + +### 5.9.3 + An implementation note for storing digital sample data when using + WavPack compression within ZIP files is that all of the bytes of + the sample data SHOULD be compressed. This includes any unused + bits up to the byte boundary. An example is a 2 byte sample that + uses only 12 bits for the sample data with 4 unused bits. If only + 12 bits are passed as the sample size to the WavPack routines, the 4 + unused bits will be set to 0 on extraction regardless of their original + state. To avoid this, the full 16 bits of the sample data size + SHOULD be provided. + +## 5.10 PPMd - Method 98 + +### 5.10.1 + PPMd is a data compression algorithm developed by Dmitry Shkarin + which includes a carryless rangecoder developed by Dmitry Subbotin. + This algorithm is based on predictive phrase matching on multiple + order contexts. Information and source code for this algorithm + can be found on the internet. Consult with the author of this + algorithm for information on terms or restrictions on use. + +### 5.10.2 + Support for PPMd within the ZIP format currently is provided only + for version I, revision 1 of the algorithm. Storage requirements + for using this algorithm are as follows: + +### 5.10.3 + Parameters needed to control the algorithm are stored in the two + bytes immediately preceding the compressed data. These bytes are + used to store the following fields: + + Model order - sets the maximum model order, default is 8, possible + values are from 2 to 16 inclusive + + Sub-allocator size - sets the size of sub-allocator in MB, default is 50, + possible values are from 1MB to 256MB inclusive + + Model restoration method - sets the method used to restart context + model at memory insufficiency, values are: + + 0 - restarts model from scratch - default + 1 - cut off model - decreases performance by as much as 2x + 2 - freeze context tree - not recommended + +### 5.10.4 + An example for packing these fields into the 2 byte storage field is + illustrated below. These values are stored in Intel low-byte/high-byte + order. + + wPPMd = (Model order - 1) + + ((Sub-allocator size - 1) << 4) + + (Model restoration method << 12) + + +## 5.11 AE-x Encryption marker - Method 99 + +## 5.12 JPEG variant - Method 96 + +## 5.13 PKWARE Data Compression Library Imploding - Method 10 + +## 5.14 Reserved - Method 11 + +## 5.15 Reserved - Method 13 + +## 5.16 Reserved - Method 15 + +## 5.17 IBM z/OS CMPSC Compression - Method 16 + +Method 16 utilizes the IBM hardware compression facility available +on most IBM mainframes. Hardware compression can significantly +increase the speed of data compression. This method uses a variant +of the LZ78 algorithm. CMPSC hardware compression is performed +using the COMPRESSION CALL instruction. 
+ +ZIP archives can be created using this method only on mainframes +supporting the CP instruction. Extraction MAY occur on any +platform supporting this compression algorithm. Use of this +algorithm requires creation of a compression dictionary and +an expansion dictionary. The expansion dictionary MUST be +placed into the ZIP archive for use on the system where +extraction will occur. + +Additional information on this compression algorithm and dictionaries +can be found in the IBM provided document titled IBM ESA/390 Data +Compression (SA22-7208-01). Storage requirements for using CMPSC +compression are as follows. + +The format for the compressed data stream placed into the ZIP +archive following the Local Header is: + + [dictionary header] + [expansion dictionary] + [CMPSC compressed data] + +If encryption is used to encrypt a file compressed with CMPSC, these +sections MUST be encrypted as a single entity. + +The format of the dictionary header is: + + Value Size Description + ----- ---- ----------- + Version 1 byte 1 + Flags/Symsize 1 byte Processing flags and + symbol size + DictionaryLen 4 bytes Length of the + expansion dictionary + +Explanation of processing flags and symbol size: + +The high 4 bits are used to store the processing flags. The low +4 bits represent the size of a symbol, in bits (values range +from 9-13). Flag values are defined below. + + 0x80 - expansion dictionary + 0x40 - expansion dictionary is compressed using Deflate + 0x20 - Reserved + 0x10 - Reserved + + +## 5.18 Reserved - Method 17 + +## 5.19 IBM TERSE - Method 18 + +## 5.20 IBM LZ77 z Architecture - Method 19 + +## 6.0 Traditional PKWARE Encryption + +### 6.0.1 + The following information discusses the decryption steps + required to support traditional PKWARE encryption. This + form of encryption is considered weak by today's standards + and its use is recommended only for situations with + low security needs or for compatibility with older .ZIP + applications. + +## 6.1 Traditional PKWARE Decryption + +### 6.1.1 + PKWARE is grateful to Mr. Roger Schlafly for his expert + contribution towards the development of PKWARE's traditional + encryption. + +### 6.1.2 + PKZIP encrypts the compressed data stream. Encrypted files + MUST be decrypted before they can be extracted to their original + form. + +### 6.1.3 + Each encrypted file has an extra 12 bytes stored at the start + of the data area defining the encryption header for that file. The + encryption header is originally set to random values, and then + itself encrypted, using three, 32-bit keys. The key values are + initialized using the supplied encryption password. After each byte + is encrypted, the keys are then updated using pseudo-random number + generation techniques in combination with the same CRC-32 algorithm + used in PKZIP and described elsewhere in this document. + +### 6.1.4 + The following are the basic steps required to decrypt a file: + + 1) Initialize the three 32-bit keys with the password. + 2) Read and decrypt the 12-byte encryption header, further + initializing the encryption keys. + 3) Read and decrypt the compressed data stream using the + encryption keys. 
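+
+   Implementor's note (not part of the PKWARE text): the three steps
+   above rely on the key-update, decrypt_byte and CRC-32 routines that
+   are spelled out in 6.1.5 through 6.1.7 below. A minimal Rust sketch
+   of those routines, under the definitions that follow:
+
+      struct Keys([u32; 3]);
+
+      impl Keys {
+          /// Step 1 (6.1.5): seed the keys, then fold in the password.
+          fn new(password: &[u8]) -> Self {
+              let mut k = Keys([0x12345678, 0x23456789, 0x34567890]);
+              password.iter().for_each(|&b| k.update(b));
+              k
+          }
+          /// update_keys() from 6.1.5.
+          fn update(&mut self, ch: u8) {
+              self.0[0] = crc32(self.0[0], ch);
+              self.0[1] = (self.0[1].wrapping_add(self.0[0] & 0xff))
+                  .wrapping_mul(134775813)
+                  .wrapping_add(1);
+              self.0[2] = crc32(self.0[2], (self.0[1] >> 24) as u8);
+          }
+          /// decrypt_byte() from 6.1.6; temp is a 16-bit quantity.
+          fn decrypt_byte(&self) -> u8 {
+              let temp = (self.0[2] as u16) | 2;
+              (temp.wrapping_mul(temp ^ 1) >> 8) as u8
+          }
+          /// Steps 2 and 3 (6.1.6, 6.1.7): decrypt one byte and advance
+          /// the keys. Apply to the 12-byte encryption header first.
+          fn decrypt(&mut self, c: u8) -> u8 {
+              let p = c ^ self.decrypt_byte();
+              self.update(p);
+              p
+          }
+      }
+
+      /// One byte-step of the CRC-32 used by ZIP (reflected polynomial
+      /// 0xEDB88320), equivalent to the table-driven crc32(old, char).
+      fn crc32(old: u32, ch: u8) -> u32 {
+          let mut c = (old ^ ch as u32) & 0xff;
+          for _ in 0..8 {
+              c = if c & 1 != 0 { 0xEDB88320 ^ (c >> 1) } else { c >> 1 };
+          }
+          (old >> 8) ^ c
+      }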
+ +### 6.1.5 + Initializing the encryption keys + + Key(0) <- 305419896 + Key(1) <- 591751049 + Key(2) <- 878082192 + + loop for i <- 0 to length(password)-1 + update_keys(password(i)) + end loop + + Where update_keys() is defined as: + + update_keys(char): + Key(0) <- crc32(key(0),char) + Key(1) <- Key(1) + (Key(0) & 000000ffH) + Key(1) <- Key(1) * 134775813 + 1 + Key(2) <- crc32(key(2),key(1) >> 24) + end update_keys + + Where crc32(old_crc,char) is a routine that given a CRC value and a + character, returns an updated CRC value after applying the CRC-32 + algorithm described elsewhere in this document. + + ### 6.1.6 + Decrypting the encryption header + + The purpose of this step is to further initialize the encryption + keys, based on random data, to render a plaintext attack on the + data ineffective. + + Read the 12-byte encryption header into Buffer, in locations + Buffer(0) thru Buffer(11). + + loop for i <- 0 to 11 + C <- buffer(i) ^ decrypt_byte() + update_keys(C) + buffer(i) <- C + end loop + + Where decrypt_byte() is defined as: + + unsigned char decrypt_byte() + local unsigned short temp + temp <- Key(2) | 2 + decrypt_byte <- (temp * (temp ^ 1)) >> 8 + end decrypt_byte + + After the header is decrypted, the last 1 or 2 bytes in Buffer + SHOULD be the high-order word/byte of the CRC for the file being + decrypted, stored in Intel low-byte/high-byte order. Versions of + PKZIP prior to 2.0 used a 2 byte CRC check; a 1 byte CRC check is + used on versions after 2.0. This can be used to test if the password + supplied is correct or not. + +### 6.1.7 + Decrypting the compressed data stream + + The compressed data stream can be decrypted as follows: + + loop until done + read a character into C + Temp <- C ^ decrypt_byte() + update_keys(temp) + output Temp + end loop + + +## 7.0 Strong Encryption Specification + +### 7.0.1 + Portions of the Strong Encryption technology defined in this + specification are covered under patents and pending patent applications. + Refer to the section in this document entitled "Incorporating + PKWARE Proprietary Technology into Your Product" for more information. + +## 7.1 Strong Encryption Overview + +### 7.1.1 + Version 5.x of this specification introduced support for strong + encryption algorithms. These algorithms can be used with either + a password or an X.509v3 digital certificate to encrypt each file. + This format specification supports either password or certificate + based encryption to meet the security needs of today, to enable + interoperability between users within both PKI and non-PKI + environments, and to ensure interoperability between different + computing platforms that are running a ZIP program. + +### 7.1.2 + Password based encryption is the most common form of encryption + people are familiar with. However, inherent weaknesses with + passwords (e.g. susceptibility to dictionary/brute force attack) + as well as password management and support issues make certificate + based encryption a more secure and scalable option. Industry + efforts and support are defining and moving towards more advanced + security solutions built around X.509v3 digital certificates and + Public Key Infrastructures(PKI) because of the greater scalability, + administrative options, and more robust security over traditional + password based encryption. + +### 7.1.3 + Most standard encryption algorithms are supported with this + specification. 
Reference implementations for many of these + algorithms are available from either commercial or open source + distributors. Readily available cryptographic toolkits make + implementation of the encryption features straight-forward. + This document is not intended to provide a treatise on data + encryption principles or theory. Its purpose is to document the + data structures required for implementing interoperable data + encryption within the .ZIP format. It is strongly recommended that + you have a good understanding of data encryption before reading + further. + +### 7.1.4 + The algorithms introduced in Version 5.0 of this specification + include: + + RC2 40 bit, 64 bit, and 128 bit + RC4 40 bit, 64 bit, and 128 bit + DES + 3DES 112 bit and 168 bit + + Version 5.1 adds support for the following: + + AES 128 bit, 192 bit, and 256 bit + + +### 7.1.5 + Version 6.1 introduces encryption data changes to support + interoperability with Smartcard and USB Token certificate storage + methods which do not support the OAEP strengthening standard. + +### 7.1.6 + Version 6.2 introduces support for encrypting metadata by compressing + and encrypting the central directory data structure to reduce information + leakage. Information leakage can occur in legacy ZIP applications + through exposure of information about a file even though that file is + stored encrypted. The information exposed consists of file + characteristics stored within the records and fields defined by this + specification. This includes data such as a file's name, its original + size, timestamp and CRC32 value. + +### 7.1.7 + Version 6.3 introduces support for encrypting data using the Blowfish + and Twofish algorithms. These are symmetric block ciphers developed + by Bruce Schneier. Blowfish supports using a variable length key from + 32 to 448 bits. Block size is 64 bits. Implementations SHOULD use 16 + rounds and the only mode supported within ZIP files is CBC. Twofish + supports key sizes 128, 192 and 256 bits. Block size is 128 bits. + Implementations SHOULD use 16 rounds and the only mode supported within + ZIP files is CBC. Information and source code for both Blowfish and + Twofish algorithms can be found on the internet. Consult with the author + of these algorithms for information on terms or restrictions on use. + +### 7.1.8 + Central Directory Encryption provides greater protection against + information leakage by encrypting the Central Directory structure and + by masking key values that are replicated in the unencrypted Local + Header. ZIP compatible programs that cannot interpret an encrypted + Central Directory structure cannot rely on the data in the corresponding + Local Header for decompression information. + +### 7.1.9 + Extra Field records that MAY contain information about a file that SHOULD + not be exposed SHOULD NOT be stored in the Local Header and SHOULD only + be written to the Central Directory where they can be encrypted. This + design currently does not support streaming. Information in the End of + Central Directory record, the Zip64 End of Central Directory Locator, + and the Zip64 End of Central Directory records are not encrypted. Access + to view data on files within a ZIP file with an encrypted Central Directory + requires the appropriate password or private key for decryption prior to + viewing any files, or any information about the files, in the archive. 
+ +### 7.1.10 + Older ZIP compatible programs not familiar with the Central Directory + Encryption feature will no longer be able to recognize the Central + Directory and MAY assume the ZIP file is corrupt. Programs that + attempt streaming access using Local Headers will see invalid + information for each file. Central Directory Encryption need not be + used for every ZIP file. Its use is recommended for greater security. + ZIP files not using Central Directory Encryption SHOULD operate as + in the past. + +### 7.1.11 + This strong encryption feature specification is intended to provide for + scalable, cross-platform encryption needs ranging from simple password + encryption to authenticated public/private key encryption. + +### 7.1.12 + Encryption provides data confidentiality and privacy. It is + recommended that you combine X.509 digital signing with encryption + to add authentication and non-repudiation. + + +## 7.2 Single Password Symmetric Encryption Method + +### 7.2.1 + The Single Password Symmetric Encryption Method using strong + encryption algorithms operates similarly to the traditional + PKWARE encryption defined in this format. Additional data + structures are added to support the processing needs of the + strong algorithms. + + The Strong Encryption data structures are: + +### 7.2.2 + General Purpose Bits - Bits 0 and 6 of the General Purpose bit + flag in both local and central header records. Both bits set + indicates strong encryption. Bit 13, when set indicates the Central + Directory is encrypted and that selected fields in the Local Header + are masked to hide their actual value. + + +### 7.2.3 + Extra Field 0x0017 in central header only. + + Fields to consider in this record are: + + ### 7.2.3.1 +Format - the data format identifier for this record. The only + value allowed at this time is the integer value 2. + + ### 7.2.3.2 +AlgId - integer identifier of the encryption algorithm from the + following range + + 0x6601 - DES + 0x6602 - RC2 (version needed to extract < 5.2) + 0x6603 - 3DES 168 + 0x6609 - 3DES 112 + 0x660E - AES 128 + 0x660F - AES 192 + 0x6610 - AES 256 + 0x6702 - RC2 (version needed to extract >= 5.2) + 0x6720 - Blowfish + 0x6721 - Twofish + 0x6801 - RC4 + 0xFFFF - Unknown algorithm + + ### 7.2.3.3 +Bitlen - Explicit bit length of key + + 32 - 448 bits + + ### 7.2.3.4 +Flags - Processing flags needed for decryption + + 0x0001 - Password is required to decrypt + 0x0002 - Certificates only + 0x0003 - Password or certificate required to decrypt + + Values > 0x0003 reserved for certificate processing + + + ### 7.2.4 + Decryption header record preceding compressed file data. + + -Decryption Header: + + Value Size Description + ----- ---- ----------- + IVSize 2 bytes Size of initialization vector (IV) + IVData IVSize Initialization vector for this file + Size 4 bytes Size of remaining decryption header data + Format 2 bytes Format definition for this record + AlgID 2 bytes Encryption algorithm identifier + Bitlen 2 bytes Bit length of encryption key + Flags 2 bytes Processing flags + ErdSize 2 bytes Size of Encrypted Random Data + ErdData ErdSize Encrypted Random Data + Reserved1 4 bytes Reserved certificate processing data + Reserved2 (var) Reserved for certificate processing data + VSize 2 bytes Size of password validation data + VData VSize-4 Password validation data + VCRC32 4 bytes Standard ZIP CRC32 of password validation data + + ### 7.2.4.1 +IVData - The size of the IV SHOULD match the algorithm block size. + The IVData can be completely random data. 
+   If the size of the randomly generated data does not match the
+   block size it SHOULD be padded with zeros or truncated as
+   necessary. If IVSize is 0, then IV = CRC32 + Uncompressed
+   File Size (as a 64-bit little-endian, unsigned integer value).
+
+   ### 7.2.4.2
+Format - the data format identifier for this record. The only
+   value allowed at this time is the integer value 3.
+
+   ### 7.2.4.3
+AlgId - integer identifier of the encryption algorithm from the
+   following range
+
+      0x6601 - DES
+      0x6602 - RC2 (version needed to extract < 5.2)
+      0x6603 - 3DES 168
+      0x6609 - 3DES 112
+      0x660E - AES 128
+      0x660F - AES 192
+      0x6610 - AES 256
+      0x6702 - RC2 (version needed to extract >= 5.2)
+      0x6720 - Blowfish
+      0x6721 - Twofish
+      0x6801 - RC4
+      0xFFFF - Unknown algorithm
+
+   ### 7.2.4.4
+Bitlen - Explicit bit length of key
+
+      32 - 448 bits
+
+   ### 7.2.4.5
+Flags - Processing flags needed for decryption
+
+      0x0001 - Password is required to decrypt
+      0x0002 - Certificates only
+      0x0003 - Password or certificate required to decrypt
+
+      Values > 0x0003 reserved for certificate processing
+
+   ### 7.2.4.6
+ErdData - Encrypted random data is used to store random data that
+   is used to generate a file session key for encrypting
+   each file. SHA1 is used to calculate hash data used to
+   derive keys. File session keys are derived from a master
+   session key generated from the user-supplied password.
+   If the Flags field in the decryption header contains
+   the value 0x4000, then the ErdData field MUST be
+   decrypted using 3DES. If the value 0x4000 is not set,
+   then the ErdData field MUST be decrypted using AlgId.
+
+   ### 7.2.4.7
+Reserved1 - Reserved for certificate processing, if value is
+   zero, then Reserved2 data is absent. See the explanation
+   under the Certificate Processing Method for details on
+   this data structure.
+
+   ### 7.2.4.8
+Reserved2 - If present, the size of the Reserved2 data structure
+   is located by skipping the first 4 bytes of this field
+   and using the next 2 bytes as the remaining size. See
+   the explanation under the Certificate Processing Method
+   for details on this data structure.
+
+   ### 7.2.4.9
+VSize - This size value will always include the 4 bytes of the
+   VCRC32 data and will be greater than 4 bytes.
+
+   ### 7.2.4.10
+VData - Random data for password validation. This data is VSize
+   in length and VSize MUST be a multiple of the encryption
+   block size. VCRC32 is a checksum value of VData.
+   VData and VCRC32 are stored encrypted and start the
+   stream of encrypted data for a file.
+
+### 7.2.5
+   Useful Tips
+
+   ### 7.2.5.1
+Strong Encryption is always applied to a file after compression. The
+   block oriented algorithms all operate in Cipher Block Chaining (CBC)
+   mode. The block size used for AES encryption is 16. All other block
+   algorithms use a block size of 8. Two IDs are defined for RC2 to
+   account for a discrepancy found in the implementation of the RC2
+   algorithm in the cryptographic library on Windows XP SP1 and all
+   earlier versions of Windows. It is recommended that zero-length files
+   not be encrypted; however, programs SHOULD be prepared to extract them
+   if they are found within a ZIP file.
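+
+   Implementor's note (not part of the PKWARE text): combining 7.2.4.1
+   with the block sizes above, the per-file IV can be derived as in the
+   following Rust sketch; the helper name and signature are illustrative
+   only, and block_size is assumed to have been chosen from AlgId:
+
+      /// Build the IV for one file: use IVData when present, otherwise
+      /// fall back to CRC32 + uncompressed size (64-bit little-endian),
+      /// then zero-pad or truncate to the algorithm block size
+      /// (16 for AES, 8 for the other block ciphers per 7.2.5.1).
+      fn file_iv(iv_data: &[u8], crc32: u32, uncomp_size: u64, block_size: usize) -> Vec<u8> {
+          let mut iv = if iv_data.is_empty() {
+              let mut v = crc32.to_le_bytes().to_vec();
+              v.extend_from_slice(&uncomp_size.to_le_bytes());
+              v
+          } else {
+              iv_data.to_vec()
+          };
+          iv.resize(block_size, 0); // pad with zeros or truncate
+          iv
+      }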
+
+### 7.2.5.2
+   A pseudo-code representation of the encryption process is as follows:
+
+      Password = GetUserPassword()
+      MasterSessionKey = DeriveKey(SHA1(Password))
+      RD = CryptographicStrengthRandomData()
+      For Each File
+         IV = CryptographicStrengthRandomData()
+         VData = CryptographicStrengthRandomData()
+         VCRC32 = CRC32(VData)
+         FileSessionKey = DeriveKey(SHA1(IV + RD))
+         ErdData = Encrypt(RD, MasterSessionKey, IV)
+         Encrypt(VData + VCRC32 + FileData, FileSessionKey, IV)
+      Done
+
+### 7.2.5.3
+   The function names and parameter requirements will depend on
+   the choice of the cryptographic toolkit selected. Almost any
+   toolkit supporting the reference implementations for each
+   algorithm can be used. The RSA BSAFE(r), OpenSSL, and Microsoft
+   CryptoAPI libraries are all known to work well.
+
+
+## 7.3 Single Password - Central Directory Encryption
+
+### 7.3.1
+   Central Directory Encryption is achieved within the .ZIP format by
+   encrypting the Central Directory structure. This encapsulates the metadata
+   most often used for processing .ZIP files. Additional metadata is stored for
+   redundancy in the Local Header for each file. The process of concealing
+   metadata by encrypting the Central Directory does not protect the data within
+   the Local Header. To avoid information leakage from the exposed metadata
+   in the Local Header, the fields containing information about a file are masked.
+
+### 7.3.2
+   Local Header
+
+   Masking replaces the true content of the fields for a file in the Local
+   Header with false information. When masked, the Local Header is not
+   suitable for streaming access and the options for data recovery of damaged
+   archives are reduced. Extra Data fields that MAY contain confidential
+   data SHOULD NOT be stored within the Local Header. The value set into
+   the Version needed to extract field SHOULD be the correct value needed to
+   extract the file without regard to Central Directory Encryption. The fields
+   within the Local Header targeted for masking when the Central Directory is
+   encrypted are:
+
+        Field Name                 Mask Value
+        ------------------         ---------------------------
+        compression method         0
+        last mod file time         0
+        last mod file date         0
+        crc-32                     0
+        compressed size            0
+        uncompressed size          0
+        file name (variable size)  Base 16 value from the
+                                   range 1 - 0xFFFFFFFFFFFFFFFF
+                                   represented as a string whose
+                                   size will be set into the
+                                   file name length field
+
+   The Base 16 value assigned as a masked file name is simply a sequentially
+   incremented value for each file starting with 1 for the first file.
+   Modifications to a ZIP file MAY cause different values to be stored for
+   each file. For compatibility, the file name field in the Local Header
+   SHOULD NOT be left blank. As of Version 6.2 of this specification,
+   the Compression Method and Compressed Size fields are not yet masked.
+   Fields having a value of 0xFFFF or 0xFFFFFFFF for the ZIP64 format
+   SHOULD NOT be masked.
+
+### 7.3.3
+   Encrypting the Central Directory
+
+   Encryption of the Central Directory does not include encryption of the
+   Central Directory Signature data, the Zip64 End of Central Directory
+   record, the Zip64 End of Central Directory Locator, or the End
+   of Central Directory record. The ZIP file comment data is never
+   encrypted.
+
+   Before encrypting the Central Directory, it MAY optionally be compressed.
+   Compression is not required, but for storage efficiency it is assumed
+   this structure will be compressed before encrypting.
+   Similarly, this
+   specification supports compressing the Central Directory without
+   requiring that it also be encrypted. Early implementations of this
+   feature will assume the encryption method applied to files matches the
+   encryption applied to the Central Directory.
+
+   Encryption of the Central Directory is done in a manner similar to
+   that of file encryption. The encrypted data is preceded by a
+   decryption header. The decryption header is known as the Archive
+   Decryption Header. The fields of this record are identical to
+   the decryption header preceding each encrypted file. The location
+   of the Archive Decryption Header is determined by the value in the
+   Start of the Central Directory field in the Zip64 End of Central
+   Directory record. When the Central Directory is encrypted, the
+   Zip64 End of Central Directory record will always be present.
+
+   The layout of the Zip64 End of Central Directory record for all
+   versions starting with 6.2 of this specification will follow the
+   Version 2 format. The Version 2 format is as follows:
+
+   The leading fixed size fields within the Version 1 format for this
+   record remain unchanged. The record signature for both Version 1
+   and Version 2 will be 0x06064b50. Immediately following the last
+   byte of the field known as the Offset of Start of Central
+   Directory With Respect to the Starting Disk Number will begin the
+   new fields defining Version 2 of this record.
+
+### 7.3.4
+   New fields for Version 2
+
+   Note: all fields stored in Intel low-byte/high-byte order.
+
+          Value                 Size       Description
+          -----                 ----       -----------
+          Compression Method    2 bytes    Method used to compress the
+                                           Central Directory
+          Compressed Size       8 bytes    Size of the compressed data
+          Original Size         8 bytes    Original uncompressed size
+          AlgId                 2 bytes    Encryption algorithm ID
+          BitLen                2 bytes    Encryption key length
+          Flags                 2 bytes    Encryption flags
+          HashID                2 bytes    Hash algorithm identifier
+          Hash Length           2 bytes    Length of hash data
+          Hash Data             (variable) Hash data
+
+   The Compression Method accepts the same range of values as the
+   corresponding field in the Central Header.
+
+   The Compressed Size and Original Size values will not include the
+   data of the Central Directory Signature, which is not compressed or
+   encrypted.
+
+   The AlgId, BitLen, and Flags fields accept the same range of values
+   as the corresponding fields within the 0x0017 record.
+
+   Hash ID identifies the algorithm used to hash the Central Directory
+   data. This data does not have to be hashed, in which case the
+   values for both the HashID and Hash Length will be 0. Possible
+   values for HashID are:
+
+          Value    Algorithm
+          ------   ---------
+          0x0000   none
+          0x0001   CRC32
+          0x8003   MD5
+          0x8004   SHA1
+          0x8007   RIPEMD160
+          0x800C   SHA256
+          0x800D   SHA384
+          0x800E   SHA512
+
+### 7.3.5
+   When the Central Directory data is signed, the same hash algorithm
+   used to hash the Central Directory for signing SHOULD be used.
+   This is recommended for processing efficiency; however, it is
+   permissible for any of the above algorithms to be used independent
+   of the signing process.
+
+   The Hash Data will contain the hash data for the Central Directory.
+   The length of this data will vary depending on the algorithm used.
+
+   The Version Needed to Extract SHOULD be set to 62.
+
+   The value for the Total Number of Entries on the Current Disk will
+   be 0. These records will no longer support random access when
+   encrypting the Central Directory.
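+
+   To make the Version 2 layout concrete, the new fields of section 7.3.4
+   could be modeled as below. This is a hypothetical sketch (the names are
+   ours, not part of this specification); all integers are stored in Intel
+   low-byte/high-byte (little-endian) order:
+
+```rust
+/// The fields appended to the Zip64 End of Central Directory record by
+/// the Version 2 format (section 7.3.4).
+struct Zip64EocdrV2Fields {
+    compression_method: u16, // method used to compress the Central Directory
+    compressed_size: u64,    // size of the compressed directory data
+    original_size: u64,      // original uncompressed directory size
+    alg_id: u16,             // encryption algorithm ID (as in the 0x0017 record)
+    bit_len: u16,            // encryption key length in bits
+    flags: u16,              // encryption processing flags
+    hash_id: u16,            // 0x0000 = none, 0x0001 = CRC32, 0x800C = SHA256, ...
+    hash_data: Vec<u8>,      // Hash Length (2 bytes) of hash data follows
+}
+```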
+
+### 7.3.6
+   When the Central Directory is compressed and/or encrypted, the
+   End of Central Directory record will store the value 0xFFFFFFFF
+   as the value for the Total Number of Entries in the Central
+   Directory. The value stored in the Total Number of Entries in
+   the Central Directory on this Disk field will be 0. The actual
+   values will be stored in the equivalent fields of the Zip64
+   End of Central Directory record.
+
+### 7.3.7
+   Decrypting and decompressing the Central Directory is accomplished
+   in the same manner as decrypting and decompressing a file.
+
+## 7.4 Certificate Processing Method
+
+   The Certificate Processing Method for ZIP file encryption
+   defines the following additional data fields:
+
+### 7.4.1
+   Certificate Flag Values
+
+   Additional processing flags that can be present in the Flags field of both
+   the 0x0017 field of the central directory Extra Field and the Decryption
+   header record preceding compressed file data are:
+
+      0x0007 - reserved for future use
+      0x000F - reserved for future use
+      0x0100 - Indicates non-OAEP key wrapping was used. If this
+               field is set, the version needed to extract MUST
+               be at least 61. This means OAEP key wrapping is not
+               used when generating a Master Session Key using
+               ErdData.
+      0x4000 - ErdData MUST be decrypted using 3DES-168, otherwise use the
+               same algorithm used for encrypting the file contents.
+      0x8000 - reserved for future use
+
+### 7.4.2
+   CertData - Extra Field 0x0017 record certificate data structure
+
+   The data structure used to store certificate data within the section
+   of the Extra Field defined by the CertData field of the 0x0017
+   record is as shown:
+
+          Value     Size     Description
+          -----     ----     -----------
+          RCount    4 bytes  Number of recipients.
+          HashAlg   2 bytes  Hash algorithm identifier
+          HSize     2 bytes  Hash size
+          SRList    (var)    Simple list of recipients' hashed public keys
+
+   RCount   This defines the number of intended recipients whose
+            public keys were used for encryption. This identifies
+            the number of elements in the SRList.
+
+   HashAlg  This defines the hash algorithm used to calculate
+            the public key hash of each public key used
+            for encryption. This field currently supports
+            only the following value for SHA-1
+
+            0x8004 - SHA1
+
+   HSize    This defines the size of a hashed public key.
+
+   SRList   This is a variable length list of the hashed
+            public keys for each intended recipient. Each
+            element in this list is HSize. The total size of
+            SRList is determined using RCount * HSize.
+
+### 7.4.3
+   Reserved1 - Certificate Decryption Header Reserved1 Data
+
+          Value     Size     Description
+          -----     ----     -----------
+          RCount    4 bytes  Number of recipients.
+
+   RCount   This defines the number of intended recipients whose
+            public keys were used for encryption. This defines
+            the number of elements in the REList field defined below.
+
+### 7.4.4
+   Reserved2 - Certificate Decryption Header Reserved2 Data Structures
+
+          Value     Size     Description
+          -----     ----     -----------
+          HashAlg   2 bytes  Hash algorithm identifier
+          HSize     2 bytes  Hash size
+          REList    (var)    List of recipient data elements
+
+   HashAlg  This defines the hash algorithm used to calculate
+            the public key hash of each public key used
+            for encryption. This field currently supports
+            only the following value for SHA-1
+
+            0x8004 - SHA1
+
+   HSize    This defines the size of a hashed public key
+            defined in REHData.
+
+   REList   This is a variable-length list of recipient data.
+ Each element in this list consists of a Recipient + Element data structure as follows: + + + Recipient Element (REList) Data Structure: + + Value Size Description + ----- ---- ----------- + RESize 2 bytes Size of REHData + REKData + REHData HSize Hash of recipients public key + REKData (var) Simple key blob + + + RESize This defines the size of an individual REList + element. This value is the combined size of the + REHData field + REKData field. REHData is defined by + HSize. REKData is variable and can be calculated + for each REList element using RESize and HSize. + + REHData Hashed public key for this recipient. + + REKData Simple Key Blob. The format of this data structure + is identical to that defined in the Microsoft + CryptoAPI and generated using the CryptExportKey() + function. The version of the Simple Key Blob + supported at this time is 0x02 as defined by + Microsoft. + +## 7.5 Certificate Processing - Central Directory Encryption + +### 7.5.1 + Central Directory Encryption using Digital Certificates will + operate in a manner similar to that of Single Password Central + Directory Encryption. This record will only be present when there + is data to place into it. Currently, data is placed into this + record when digital certificates are used for either encrypting + or signing the files within a ZIP file. When only password + encryption is used with no certificate encryption or digital + signing, this record is not currently needed. When present, this + record will appear before the start of the actual Central Directory + data structure and will be located immediately after the Archive + Decryption Header if the Central Directory is encrypted. + +### 7.5.2 + The Archive Extra Data record will be used to store the following + information. Additional data MAY be added in future versions. + + Extra Data Fields: + + 0x0014 - PKCS#7 Store for X.509 Certificates + 0x0016 - X.509 Certificate ID and Signature for central directory + 0x0019 - PKCS#7 Encryption Recipient Certificate List + + The 0x0014 and 0x0016 Extra Data records that otherwise would be + located in the first record of the Central Directory for digital + certificate processing. When encrypting or compressing the Central + Directory, the 0x0014 and 0x0016 records MUST be located in the + Archive Extra Data record and they SHOULD NOT remain in the first + Central Directory record. The Archive Extra Data record will also + be used to store the 0x0019 data. + +### 7.5.3 + When present, the size of the Archive Extra Data record will be + included in the size of the Central Directory. The data of the + Archive Extra Data record will also be compressed and encrypted + along with the Central Directory data structure. + +## 7.6 Certificate Processing Differences + +### 7.6.1 + The Certificate Processing Method of encryption differs from the + Single Password Symmetric Encryption Method as follows. Instead + of using a user-defined password to generate a master session key, + cryptographically random data is used. The key material is then + wrapped using standard key-wrapping techniques. This key material + is wrapped using the public key of each recipient that will need + to decrypt the file using their corresponding private key. + +### 7.6.2 + This specification currently assumes digital certificates will follow + the X.509 V3 format for 1024 bit and higher RSA format digital + certificates. Implementation of this Certificate Processing Method + requires supporting logic for key access and management. 
This logic + is outside the scope of this specification. + +## 7.7 OAEP Processing with Certificate-based Encryption + +### 7.7.1 + OAEP stands for Optimal Asymmetric Encryption Padding. It is a + strengthening technique used for small encoded items such as decryption + keys. This is commonly applied in cryptographic key-wrapping techniques + and is supported by PKCS #1. Versions 5.0 and 6.0 of this specification + were designed to support OAEP key-wrapping for certificate-based + decryption keys for additional security. + +### 7.7.2 + Support for private keys stored on Smartcards or Tokens introduced + a conflict with this OAEP logic. Most card and token products do + not support the additional strengthening applied to OAEP key-wrapped + data. In order to resolve this conflict, versions 6.1 and above of this + specification will no longer support OAEP when encrypting using + digital certificates. + +### 7.7.3 + Versions of PKZIP available during initial development of the + certificate processing method set a value of 61 into the + version needed to extract field for a file. This indicates that + non-OAEP key wrapping is used. This affects certificate encryption + only, and password encryption functions SHOULD NOT be affected by + this value. This means values of 61 MAY be found on files encrypted + with certificates only, or on files encrypted with both password + encryption and certificate encryption. Files encrypted with both + methods can safely be decrypted using the password methods documented. + +## 7.8 Additional Encryption/Decryption Data Records + +### 7.8.1 + Additional information MAY be stored within a ZIP file in support + of the strong password and certificate encryption methods defined above. + These include, but are not limited to the following record types. + + 0x0021 Policy Decryption Key Record + 0x0022 Smartcrypt Key Provider Record + 0x0023 Smartcrypt Policy Key Data Record + +## 8.0 Splitting and Spanning ZIP files + + 8.1 Spanned ZIP files + +### 8.1.1 + Spanning is the process of segmenting a ZIP file across + multiple removable media. This support has typically only + been provided for DOS formatted floppy diskettes. + + 8.2 Split ZIP files + +### 8.2.1 + File splitting is a newer derivation of spanning. + Splitting follows the same segmentation process as + spanning, however, it does not require writing each + segment to a unique removable medium and instead supports + placing all pieces onto local or non-removable locations + such as file systems, local drives, folders, etc. + + 8.3 File Naming Differences + +### 8.3.1 + A key difference between spanned and split ZIP files is + that all pieces of a spanned ZIP file have the same name. + Since each piece is written to a separate volume, no name + collisions occur and each segment can reuse the original + .ZIP file name given to the archive. + +### 8.3.2 + Sequence ordering for DOS spanned archives uses the DOS + volume label to determine segment numbers. Volume labels + for each segment are written using the form PKBACK#xxx, + where xxx is the segment number written as a decimal + value from 001 - nnn. + +### 8.3.3 + Split ZIP files are typically written to the same location + and are subject to name collisions if the spanned name + format is used since each segment will reside on the same + drive. To avoid name collisions, split archives are named + as follows. 
+ + Segment 1 = filename.z01 + Segment n-1 = filename.z(n-1) + Segment n = filename.zip + +### 8.3.4 + The .ZIP extension is used on the last segment to support + quickly reading the central directory. The segment number + n SHOULD be a decimal value. + + 8.4 Spanned Self-extracting ZIP Files + +### 8.4.1 + Spanned ZIP files MAY be PKSFX Self-extracting ZIP files. + PKSFX files MAY also be split, however, in this case + the first segment MUST be named filename.exe. The first + segment of a split PKSFX archive MUST be large enough to + include the entire executable program. + + 8.5 Capacities and Markers + +### 8.5.1 + Capacities for split archives are as follows: + + Maximum number of segments = 4,294,967,295 - 1 + Maximum .ZIP segment size = 4,294,967,295 bytes + Minimum segment size = 64K + Maximum PKSFX segment size = 2,147,483,647 bytes + +### 8.5.2 + Segment sizes MAY be different however by convention, all + segment sizes SHOULD be the same with the exception of the + last, which MAY be smaller. Local and central directory + header records MUST NOT be split across a segment boundary. + When writing a header record, if the number of bytes remaining + within a segment is less than the size of the header record, + end the current segment and write the header at the start + of the next segment. The central directory MAY span segment + boundaries, but no single record in the central directory + SHOULD be split across segments. + +### 8.5.3 + Spanned/Split archives created using PKZIP for Windows + (V2.50 or greater), PKZIP Command Line (V2.50 or greater), + or PKZIP Explorer will include a special spanning + signature as the first 4 bytes of the first segment of + the archive. This signature (0x08074b50) will be + followed immediately by the local header signature for + the first file in the archive. + +### 8.5.4 + A special spanning marker MAY also appear in spanned/split + archives if the spanning or splitting process starts but + only requires one segment. In this case the 0x08074b50 + signature will be replaced with the temporary spanning + marker signature of 0x30304b50. Split archives can + only be uncompressed by other versions of PKZIP that + know how to create a split archive. + +### 8.5.5 + The signature value 0x08074b50 is also used by some + ZIP implementations as a marker for the Data Descriptor + record. Conflict in this alternate assignment can be + avoided by ensuring the position of the signature + within the ZIP file to determine the use for which it + is intended. + +## 9.0 Change Process + + 9.1 In order for the .ZIP file format to remain a viable technology, this + specification SHOULD be considered as open for periodic review and + revision. Although this format was originally designed with a + certain level of extensibility, not all changes in technology + (present or future) were or will be necessarily considered in its + design. + + 9.2 If your application requires new definitions to the + extensible sections in this format, or if you would like to + submit new data structures or new capabilities, please forward + your request to zipformat@pkware.com. All submissions will be + reviewed by the ZIP File Specification Committee for possible + inclusion into future versions of this specification. + + 9.3 Periodic revisions to this specification will be published as + DRAFT or as FINAL status to ensure interoperability. We encourage + comments and feedback that MAY help improve clarity or content. 
+ + +## 10.0 Incorporating PKWARE Proprietary Technology into Your Product + + 10.1 The Use or Implementation in a product of APPNOTE technological + components pertaining to either strong encryption or patching requires + a separate, executed license agreement from PKWARE. Please contact + PKWARE at zipformat@pkware.com or +1-414-289-9788 with regard to + acquiring such a license. + + 10.2 Additional information regarding PKWARE proprietary technology is + available at http://www.pkware.com/appnote. + +## 11.0 Acknowledgements + + In addition to the above mentioned contributors to PKZIP and PKUNZIP, + PKWARE would like to extend special thanks to Robert Mahoney for + suggesting the extension .ZIP for this software. + +## 12.0 References + + Fiala, Edward R., and Greene, Daniel H., "Data compression with + finite windows", Communications of the ACM, Volume 32, Number 4, + April 1989, pages 490-505. + + Held, Gilbert, "Data Compression, Techniques and Applications, + Hardware and Software Considerations", John Wiley & Sons, 1987. + + Huffman, D.A., "A method for the construction of minimum-redundancy + codes", Proceedings of the IRE, Volume 40, Number 9, September 1952, + pages 1098-1101. + + Nelson, Mark, "LZW Data Compression", Dr. Dobbs Journal, Volume 14, + Number 10, October 1989, pages 29-37. + + Nelson, Mark, "The Data Compression Book", M&T Books, 1991. + + Storer, James A., "Data Compression, Methods and Theory", + Computer Science Press, 1988 + + Welch, Terry, "A Technique for High-Performance Data Compression", + IEEE Computer, Volume 17, Number 6, June 1984, pages 8-19. + + Ziv, J. and Lempel, A., "A universal algorithm for sequential data + compression", Communications of the ACM, Volume 30, Number 6, + June 1987, pages 520-540. + + Ziv, J. and Lempel, A., "Compression of individual sequences via + variable-rate coding", IEEE Transactions on Information Theory, + Volume 24, Number 5, September 1978, pages 530-536. + + +APPENDIX A - AS/400 Extra Field (0x0065) Attribute Definitions +-------------------------------------------------------------- + +A.1 Field Definition Structure: + + a. field length including length 2 bytes Big Endian + b. field code 2 bytes + c. data x bytes + +A.2 Field Code Description + + 4001 Source type i.e. CLP etc + 4002 The text description of the library + 4003 The text description of the file + 4004 The text description of the member + 4005 x'F0' or 0 is PF-DTA, x'F1' or 1 is PF_SRC + 4007 Database Type Code 1 byte + 4008 Database file and fields definition + 4009 GZIP file type 2 bytes + 400B IFS code page 2 bytes + 400C IFS Time of last file status change 4 bytes + 400D IFS Access Time 4 bytes + 400E IFS Modification time 4 bytes + 005C Length of the records in the file 2 bytes + 0068 GZIP two words 8 bytes + +APPENDIX B - z/OS Extra Field (0x0065) Attribute Definitions +------------------------------------------------------------ + +B.1 Field Definition Structure: + + a. field length including length 2 bytes Big Endian + b. field code 2 bytes + c. 
data x bytes + +B.2 Field Code Description + + 0001 File Type 2 bytes + 0002 NonVSAM Record Format 1 byte + 0003 Reserved + 0004 NonVSAM Block Size 2 bytes Big Endian + 0005 Primary Space Allocation 3 bytes Big Endian + 0006 Secondary Space Allocation 3 bytes Big Endian + 0007 Space Allocation Type1 byte flag + 0008 Modification Date Retired with PKZIP 5.0 + + 0009 Expiration Date Retired with PKZIP 5.0 + + 000A PDS Directory Block Allocation 3 bytes Big Endian binary value + 000B NonVSAM Volume List variable + 000C UNIT Reference Retired with PKZIP 5.0 + + 000D DF/SMS Management Class 8 bytes EBCDIC Text Value + 000E DF/SMS Storage Class 8 bytes EBCDIC Text Value + 000F DF/SMS Data Class 8 bytes EBCDIC Text Value + 0010 PDS/PDSE Member Info. 30 bytes + 0011 VSAM sub-filetype 2 bytes + 0012 VSAM LRECL 13 bytes EBCDIC "(num_avg num_max)" + 0013 VSAM Cluster Name Retired with PKZIP 5.0 + + 0014 VSAM KSDS Key Information 13 bytes EBCDIC "(num_length num_position)" + 0015 VSAM Average LRECL 5 bytes EBCDIC num_value padded with blanks + 0016 VSAM Maximum LRECL 5 bytes EBCDIC num_value padded with blanks + 0017 VSAM KSDS Key Length 5 bytes EBCDIC num_value padded with blanks + 0018 VSAM KSDS Key Position 5 bytes EBCDIC num_value padded with blanks + 0019 VSAM Data Name 1-44 bytes EBCDIC text string + 001A VSAM KSDS Index Name 1-44 bytes EBCDIC text string + 001B VSAM Catalog Name 1-44 bytes EBCDIC text string + 001C VSAM Data Space Type 9 bytes EBCDIC text string + 001D VSAM Data Space Primary 9 bytes EBCDIC num_value left-justified + 001E VSAM Data Space Secondary 9 bytes EBCDIC num_value left-justified + 001F VSAM Data Volume List variable EBCDIC text list of 6-character Volume IDs + 0020 VSAM Data Buffer Space 8 bytes EBCDIC num_value left-justified + 0021 VSAM Data CISIZE 5 bytes EBCDIC num_value left-justified + 0022 VSAM Erase Flag 1 byte flag + 0023 VSAM Free CI % 3 bytes EBCDIC num_value left-justified + 0024 VSAM Free CA % 3 bytes EBCDIC num_value left-justified + 0025 VSAM Index Volume List variable EBCDIC text list of 6-character Volume IDs + 0026 VSAM Ordered Flag 1 byte flag + 0027 VSAM REUSE Flag 1 byte flag + 0028 VSAM SPANNED Flag 1 byte flag + 0029 VSAM Recovery Flag 1 byte flag + 002A VSAM WRITECHK Flag 1 byte flag + 002B VSAM Cluster/Data SHROPTS 3 bytes EBCDIC "n,y" + 002C VSAM Index SHROPTS 3 bytes EBCDIC "n,y" + 002D VSAM Index Space Type 9 bytes EBCDIC text string + 002E VSAM Index Space Primary 9 bytes EBCDIC num_value left-justified + 002F VSAM Index Space Secondary 9 bytes EBCDIC num_value left-justified + 0030 VSAM Index CISIZE 5 bytes EBCDIC num_value left-justified + 0031 VSAM Index IMBED 1 byte flag + 0032 VSAM Index Ordered Flag 1 byte flag + 0033 VSAM REPLICATE Flag 1 byte flag + 0034 VSAM Index REUSE Flag 1 byte flag + 0035 VSAM Index WRITECHK Flag 1 byte flag Retired with PKZIP 5.0 + + 0036 VSAM Owner 8 bytes EBCDIC text string + 0037 VSAM Index Owner 8 bytes EBCDIC text string + 0038 Reserved + 0039 Reserved + 003A Reserved + 003B Reserved + 003C Reserved + 003D Reserved + 003E Reserved + 003F Reserved + 0040 Reserved + 0041 Reserved + 0042 Reserved + 0043 Reserved + 0044 Reserved + 0045 Reserved + 0046 Reserved + 0047 Reserved + 0048 Reserved + 0049 Reserved + 004A Reserved + 004B Reserved + 004C Reserved + 004D Reserved + 004E Reserved + 004F Reserved + 0050 Reserved + 0051 Reserved + 0052 Reserved + 0053 Reserved + 0054 Reserved + 0055 Reserved + 0056 Reserved + 0057 Reserved + 0058 PDS/PDSE Member TTR Info. 
6 bytes Big Endian + 0059 PDS 1st LMOD Text TTR 3 bytes Big Endian + 005A PDS LMOD EP Rec # 4 bytes Big Endian + 005B Reserved + 005C Max Length of records 2 bytes Big Endian + 005D PDSE Flag 1 byte flag + 005E Reserved + 005F Reserved + 0060 Reserved + 0061 Reserved + 0062 Reserved + 0063 Reserved + 0064 Reserved + 0065 Last Date Referenced 4 bytes Packed Hex "yyyymmdd" + 0066 Date Created 4 bytes Packed Hex "yyyymmdd" + 0068 GZIP two words 8 bytes + 0071 Extended NOTE Location 12 bytes Big Endian + 0072 Archive device UNIT 6 bytes EBCDIC + 0073 Archive 1st Volume 6 bytes EBCDIC + 0074 Archive 1st VOL File Seq# 2 bytes Binary + 0075 Native I/O Flags 2 bytes + 0081 Unix File Type 1 byte enumerated + 0082 Unix File Format 1 byte enumerated + 0083 Unix File Character Set Tag Info 4 bytes + 0090 ZIP Environmental Processing Info 4 bytes + 0091 EAV EATTR Flags 1 byte + 0092 DSNTYPE Flags 1 byte + 0093 Total Space Allocation (Cyls) 4 bytes Big Endian + 009D NONVSAM DSORG 2 bytes + 009E Program Virtual Object Info 3 bytes + 009F Encapsulated file Info 9 bytes + 400C Unix File Creation Time 4 bytes + 400D Unix File Access Time 4 bytes + 400E Unix File Modification time 4 bytes + 4101 IBMCMPSC Compression Info variable + 4102 IBMCMPSC Compression Size 8 bytes Big Endian + +APPENDIX C - Zip64 Extensible Data Sector Mappings +--------------------------------------------------- + + -Z390 Extra Field: + + The following is the general layout of the attributes for the + ZIP 64 "extra" block for extended tape operations. + + Note: some fields stored in Big Endian format. All text is + in EBCDIC format unless otherwise specified. + + Value Size Description + ----- ---- ----------- + (Z390) 0x0065 2 bytes Tag for this "extra" block type + Size 4 bytes Size for the following data block + Tag 4 bytes EBCDIC "Z390" + Length71 2 bytes Big Endian + Subcode71 2 bytes Enote type code + FMEPos 1 byte + Length72 2 bytes Big Endian + Subcode72 2 bytes Unit type code + Unit 1 byte Unit + Length73 2 bytes Big Endian + Subcode73 2 bytes Volume1 type code + FirstVol 1 byte Volume + Length74 2 bytes Big Endian + Subcode74 2 bytes FirstVol file sequence + FileSeq 2 bytes Sequence + +APPENDIX D - Language Encoding (EFS) +------------------------------------ + +D.1 The ZIP format has historically supported only the original IBM PC character +encoding set, commonly referred to as IBM Code Page 437. This limits storing +file name characters to only those within the original MS-DOS range of values +and does not properly support file names in other character encodings, or +languages. To address this limitation, this specification will support the +following change. + +D.2 If general purpose bit 11 is unset, the file name and comment SHOULD conform +to the original ZIP character encoding. If general purpose bit 11 is set, the +filename and comment MUST support The Unicode Standard, Version 4.1.0 or +greater using the character encoding form defined by the UTF-8 storage +specification. The Unicode Standard is published by the The Unicode +Consortium (www.unicode.org). UTF-8 encoded data stored within ZIP files +is expected to not include a byte order mark (BOM). + +D.3 Applications MAY choose to supplement this file name storage through the use +of the 0x0008 Extra Field. Storage for this optional field is currently +undefined, however it will be used to allow storing extended information +on source or target encoding that MAY further assist applications with file +name, or file content encoding tasks. 
+Please contact PKWARE with any
+requirements on how this field SHOULD be used.
+
+D.4 The 0x0008 Extra Field storage MAY be used with either setting for general
+purpose bit 11. Examples of the intended usage for this field are to store
+whether "modified-UTF-8" (JAVA) is used, or UTF-8-MAC. Similarly, other
+commonly used character encoding (code page) designations can be indicated
+through this field. Formalized values for use of the 0x0008 record remain
+undefined at this time. The definition for the layout of the 0x0008 field
+will be published when available. Use of the 0x0008 Extra Field provides
+for storing data within a ZIP file in an encoding other than IBM Code
+Page 437 or UTF-8.
+
+D.5 General purpose bit 11 will not imply any encoding of file content or
+password. Values defining character encoding for file content or
+password MUST be stored within the 0x0008 Extended Language Encoding
+Extra Field.
+
+D.6 Ed Gordon of the Info-ZIP group has defined a pair of "extra field" records
+that can be used to store UTF-8 file name and file comment fields. These
+records can be used for cases when the general purpose bit 11 method
+for storing UTF-8 data in the standard file name and comment fields is
+not desirable. A common case for this alternate method is if backward
+compatibility with older programs is required.
+
+D.7 Definitions for the record structure of these fields are included above
+in the section on 3rd party mappings for "extra field" records. These
+records are identified by Header IDs 0x6375 (Info-ZIP Unicode Comment
+Extra Field) and 0x7075 (Info-ZIP Unicode Path Extra Field).
+
+D.8 The choice of which storage method to use when writing a ZIP file is left
+to the implementation. Developers SHOULD expect that a ZIP file MAY
+contain either method and SHOULD provide support for reading data in
+either format. Use of general purpose bit 11 reduces storage requirements
+for file name data by not requiring additional "extra field" data for
+each file, but can result in older ZIP programs not being able to extract
+files. Use of the 0x6375 and 0x7075 records will result in a ZIP file
+that SHOULD always be readable by older ZIP programs, but requires more
+storage per file to write file name and/or file comment fields.
+
+APPENDIX E - AE-x encryption marker
+-----------------------------------
+
+E.1 AE-x defines an alternate password-based encryption method used
+in ZIP files that is based on a file encryption utility developed by
+Dr. Brian Gladman. Information on Dr. Gladman's method is available at
+
+   http://www.gladman.me.uk/cryptography_technology/fileencrypt/
+
+E.2 AE-x uses AES with CTR (counter mode) and HMAC-SHA1. It defines
+encryption using key sizes of 128 bits or 256 bits. It does not
+restrict support for decrypting 192 bits.
+
+E.3 This method uses the standard ZIP encryption bit (bit 0)
+of the general purpose bit flag (section 4.4.4) to indicate a
+file is encrypted.
+
+E.4 The compression method field (section 4.4.5) is set to 99
+to indicate a file has been encrypted using this method.
+
+E.5 The actual compression method is stored in an extra field
+structure identified by a Header ID of 0x9901. Information on this
+record structure can be found at http://www.winzip.com/aes_info.htm.
+
+E.6 Two versions are defined for the 0x9901 structure.
+
+   E.6.1 Version 1 stores the file CRC value in the CRC-32 field
+   (section 4.4.7).
+
+   E.6.2 Version 2 stores a value of 0 in the CRC-32 field.
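+
+Putting appendix E together, a reader can recognize an AE-x entry from the
+header fields alone. Below is a minimal, hypothetical helper (the function
+name and parameters are ours; a real parser would inspect its parsed
+extra-field records rather than a bare list of IDs, and the internal layout
+of the 0x9901 field is defined at the WinZip link above, not here):
+
+```rust
+/// Returns true when a header describes an AE-x encrypted entry per
+/// appendix E: general purpose bit 0 set (E.3), compression method 99
+/// (E.4), and a 0x9901 extra field present, which holds the actual
+/// compression method (E.5).
+fn is_aex_entry(general_purpose_flags: u16, compression_method: u16, extra_field_ids: &[u16]) -> bool {
+    let encrypted = general_purpose_flags & 0x0001 != 0; // section 4.4.4, bit 0
+    encrypted && compression_method == 99 && extra_field_ids.contains(&0x9901)
+}
+```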
diff --git a/crates/async_zip/rustfmt.toml b/crates/async_zip/rustfmt.toml
new file mode 100644
index 0000000..c775577
--- /dev/null
+++ b/crates/async_zip/rustfmt.toml
@@ -0,0 +1,2 @@
+max_width = 120
+use_small_heuristics = "Max"
diff --git a/crates/async_zip/src/base/mod.rs b/crates/async_zip/src/base/mod.rs
new file mode 100644
index 0000000..67b5b60
--- /dev/null
+++ b/crates/async_zip/src/base/mod.rs
@@ -0,0 +1,7 @@
+// Copyright (c) 2023 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+//! A base runtime-agnostic implementation using `futures`'s IO types.
+
+pub mod read;
+pub mod write;
diff --git a/crates/async_zip/src/base/read/io/combined_record.rs b/crates/async_zip/src/base/read/io/combined_record.rs
new file mode 100644
index 0000000..d3d41d9
--- /dev/null
+++ b/crates/async_zip/src/base/read/io/combined_record.rs
@@ -0,0 +1,68 @@
+// Copyright (c) 2023 Harry [Majored] [hello@majored.pw]
+// Copyright (c) 2023 Cognite AS
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+use crate::spec::header::{EndOfCentralDirectoryHeader, Zip64EndOfCentralDirectoryRecord};
+
+/// Combines all the fields in EOCDR and Zip64EOCDR into one struct.
+#[derive(Debug)]
+pub struct CombinedCentralDirectoryRecord {
+    pub version_made_by: Option<u16>,
+    pub version_needed_to_extract: Option<u16>,
+    pub disk_number: u32,
+    pub disk_number_start_of_cd: u32,
+    pub num_entries_in_directory_on_disk: u64,
+    pub num_entries_in_directory: u64,
+    pub directory_size: u64,
+    pub offset_of_start_of_directory: u64,
+    pub file_comment_length: u16,
+}
+
+impl CombinedCentralDirectoryRecord {
+    /// Combine an EOCDR with an optional Zip64EOCDR.
+    ///
+    /// Fields that are set to their max value in the EOCDR will be overwritten by the contents of
+    /// the corresponding Zip64EOCDR field.
+    pub fn combine(eocdr: EndOfCentralDirectoryHeader, zip64eocdr: Zip64EndOfCentralDirectoryRecord) -> Self {
+        let mut combined = Self::from(&eocdr);
+        if eocdr.disk_num == u16::MAX {
+            combined.disk_number = zip64eocdr.disk_number;
+        }
+        if eocdr.start_cent_dir_disk == u16::MAX {
+            combined.disk_number_start_of_cd = zip64eocdr.disk_number_start_of_cd;
+        }
+        if eocdr.num_of_entries_disk == u16::MAX {
+            combined.num_entries_in_directory_on_disk = zip64eocdr.num_entries_in_directory_on_disk;
+        }
+        if eocdr.num_of_entries == u16::MAX {
+            combined.num_entries_in_directory = zip64eocdr.num_entries_in_directory;
+        }
+        if eocdr.size_cent_dir == u32::MAX {
+            combined.directory_size = zip64eocdr.directory_size;
+        }
+        if eocdr.cent_dir_offset == u32::MAX {
+            combined.offset_of_start_of_directory = zip64eocdr.offset_of_start_of_directory;
+        }
+        combined.version_made_by = Some(zip64eocdr.version_made_by);
+        combined.version_needed_to_extract = Some(zip64eocdr.version_needed_to_extract);
+
+        combined
+    }
+}
+
+// An implementation for the case of no zip64EOCDR.
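+// The EOCDR's 16/32-bit counts are widened as-is; fields that only exist in the
+// Zip64 record (the two version numbers) are left as `None`. Note that sentinel
+// values (u16::MAX / u32::MAX) are preserved here and are only resolved by
+// `combine` when a Zip64 EOCDR is actually present.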
+impl From<&EndOfCentralDirectoryHeader> for CombinedCentralDirectoryRecord {
+    fn from(header: &EndOfCentralDirectoryHeader) -> Self {
+        Self {
+            version_made_by: None,
+            version_needed_to_extract: None,
+            disk_number: header.disk_num as u32,
+            disk_number_start_of_cd: header.start_cent_dir_disk as u32,
+            num_entries_in_directory_on_disk: header.num_of_entries_disk as u64,
+            num_entries_in_directory: header.num_of_entries as u64,
+            directory_size: header.size_cent_dir as u64,
+            offset_of_start_of_directory: header.cent_dir_offset as u64,
+            file_comment_length: header.file_comm_length,
+        }
+    }
+}
diff --git a/crates/async_zip/src/base/read/io/compressed.rs b/crates/async_zip/src/base/read/io/compressed.rs
new file mode 100644
index 0000000..8fc6b87
--- /dev/null
+++ b/crates/async_zip/src/base/read/io/compressed.rs
@@ -0,0 +1,103 @@
+// Copyright (c) 2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+use crate::spec::Compression;
+
+use std::pin::Pin;
+use std::task::{Context, Poll};
+
+#[cfg(any(
+    feature = "deflate",
+    feature = "bzip2",
+    feature = "zstd",
+    feature = "lzma",
+    feature = "xz",
+    feature = "deflate64"
+))]
+use async_compression::futures::bufread;
+use futures_lite::io::{AsyncBufRead, AsyncRead};
+use pin_project::pin_project;
+
+/// A wrapping reader which holds concrete types for all respective compression method readers.
+#[pin_project(project = CompressedReaderProj)]
+pub(crate) enum CompressedReader<R> {
+    Stored(#[pin] R),
+    #[cfg(feature = "deflate")]
+    Deflate(#[pin] bufread::DeflateDecoder<R>),
+    #[cfg(feature = "deflate64")]
+    Deflate64(#[pin] bufread::Deflate64Decoder<R>),
+    #[cfg(feature = "bzip2")]
+    Bz(#[pin] bufread::BzDecoder<R>),
+    #[cfg(feature = "lzma")]
+    Lzma(#[pin] bufread::LzmaDecoder<R>),
+    #[cfg(feature = "zstd")]
+    Zstd(#[pin] bufread::ZstdDecoder<R>),
+    #[cfg(feature = "xz")]
+    Xz(#[pin] bufread::XzDecoder<R>),
+}
+
+impl<R> CompressedReader<R>
+where
+    R: AsyncBufRead + Unpin,
+{
+    /// Constructs a new wrapping reader from a generic [`AsyncBufRead`] implementer.
+    pub(crate) fn new(reader: R, compression: Compression) -> Self {
+        match compression {
+            Compression::Stored => CompressedReader::Stored(reader),
+            #[cfg(feature = "deflate")]
+            Compression::Deflate => CompressedReader::Deflate(bufread::DeflateDecoder::new(reader)),
+            #[cfg(feature = "deflate64")]
+            Compression::Deflate64 => CompressedReader::Deflate64(bufread::Deflate64Decoder::new(reader)),
+            #[cfg(feature = "bzip2")]
+            Compression::Bz => CompressedReader::Bz(bufread::BzDecoder::new(reader)),
+            #[cfg(feature = "lzma")]
+            Compression::Lzma => CompressedReader::Lzma(bufread::LzmaDecoder::new(reader)),
+            #[cfg(feature = "zstd")]
+            Compression::Zstd => CompressedReader::Zstd(bufread::ZstdDecoder::new(reader)),
+            #[cfg(feature = "xz")]
+            Compression::Xz => CompressedReader::Xz(bufread::XzDecoder::new(reader)),
+        }
+    }
+
+    /// Consumes this reader and returns the inner value.
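+    ///
+    /// Note that any partially-decoded state held by a decompressor variant is
+    /// discarded; only the wrapped reader is returned.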
+    pub(crate) fn into_inner(self) -> R {
+        match self {
+            CompressedReader::Stored(inner) => inner,
+            #[cfg(feature = "deflate")]
+            CompressedReader::Deflate(inner) => inner.into_inner(),
+            #[cfg(feature = "deflate64")]
+            CompressedReader::Deflate64(inner) => inner.into_inner(),
+            #[cfg(feature = "bzip2")]
+            CompressedReader::Bz(inner) => inner.into_inner(),
+            #[cfg(feature = "lzma")]
+            CompressedReader::Lzma(inner) => inner.into_inner(),
+            #[cfg(feature = "zstd")]
+            CompressedReader::Zstd(inner) => inner.into_inner(),
+            #[cfg(feature = "xz")]
+            CompressedReader::Xz(inner) => inner.into_inner(),
+        }
+    }
+}
+
+impl<R> AsyncRead for CompressedReader<R>
+where
+    R: AsyncBufRead + Unpin,
+{
+    fn poll_read(self: Pin<&mut Self>, c: &mut Context<'_>, b: &mut [u8]) -> Poll<std::io::Result<usize>> {
+        match self.project() {
+            CompressedReaderProj::Stored(inner) => inner.poll_read(c, b),
+            #[cfg(feature = "deflate")]
+            CompressedReaderProj::Deflate(inner) => inner.poll_read(c, b),
+            #[cfg(feature = "deflate64")]
+            CompressedReaderProj::Deflate64(inner) => inner.poll_read(c, b),
+            #[cfg(feature = "bzip2")]
+            CompressedReaderProj::Bz(inner) => inner.poll_read(c, b),
+            #[cfg(feature = "lzma")]
+            CompressedReaderProj::Lzma(inner) => inner.poll_read(c, b),
+            #[cfg(feature = "zstd")]
+            CompressedReaderProj::Zstd(inner) => inner.poll_read(c, b),
+            #[cfg(feature = "xz")]
+            CompressedReaderProj::Xz(inner) => inner.poll_read(c, b),
+        }
+    }
+}
diff --git a/crates/async_zip/src/base/read/io/entry.rs b/crates/async_zip/src/base/read/io/entry.rs
new file mode 100644
index 0000000..64e81c6
--- /dev/null
+++ b/crates/async_zip/src/base/read/io/entry.rs
@@ -0,0 +1,128 @@
+// Copyright (c) 2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+use crate::base::read::io::{compressed::CompressedReader, hashed::HashedReader, owned::OwnedReader};
+use crate::entry::ZipEntry;
+use crate::error::{Result, ZipError};
+use crate::spec::Compression;
+
+use std::pin::Pin;
+use std::task::{Context, Poll};
+
+use futures_lite::io::{AsyncBufRead, AsyncRead, AsyncReadExt, Take};
+use pin_project::pin_project;
+
+/// A type which encodes that [`ZipEntryReader`] has associated entry data.
+pub struct WithEntry<'a>(OwnedEntry<'a>);
+
+/// A type which encodes that [`ZipEntryReader`] has no associated entry data.
+pub struct WithoutEntry;
+
+/// A ZIP entry reader which may implement decompression.
+#[pin_project]
+pub struct ZipEntryReader<'a, R, E> {
+    #[pin]
+    reader: HashedReader<CompressedReader<Take<OwnedReader<'a, R>>>>,
+    entry: E,
+}
+
+impl<'a, R> ZipEntryReader<'a, R, WithoutEntry>
+where
+    R: AsyncBufRead + Unpin,
+{
+    /// Constructs a new entry reader from its required parameters (incl. an owned R).
+    pub fn new_with_owned(reader: R, compression: Compression, size: u64) -> Self {
+        let reader = HashedReader::new(CompressedReader::new(OwnedReader::Owned(reader).take(size), compression));
+        Self { reader, entry: WithoutEntry }
+    }
+
+    /// Constructs a new entry reader from its required parameters (incl. a mutable borrow of an R).
+    pub(crate) fn new_with_borrow(reader: &'a mut R, compression: Compression, size: u64) -> Self {
+        let reader = HashedReader::new(CompressedReader::new(OwnedReader::Borrow(reader).take(size), compression));
+        Self { reader, entry: WithoutEntry }
+    }
+
+    pub(crate) fn into_with_entry(self, entry: &'a ZipEntry) -> ZipEntryReader<'a, R, WithEntry<'a>> {
+        ZipEntryReader { reader: self.reader, entry: WithEntry(OwnedEntry::Borrow(entry)) }
+    }
+
+    pub(crate) fn into_with_entry_owned(self, entry: ZipEntry) -> ZipEntryReader<'a, R, WithEntry<'a>> {
+        ZipEntryReader { reader: self.reader, entry: WithEntry(OwnedEntry::Owned(entry)) }
+    }
+}
+
+impl<'a, R, E> AsyncRead for ZipEntryReader<'a, R, E>
+where
+    R: AsyncBufRead + Unpin,
+{
+    fn poll_read(self: Pin<&mut Self>, c: &mut Context<'_>, b: &mut [u8]) -> Poll<std::io::Result<usize>> {
+        self.project().reader.poll_read(c, b)
+    }
+}
+
+impl<'a, R, E> ZipEntryReader<'a, R, E>
+where
+    R: AsyncBufRead + Unpin,
+{
+    /// Computes and returns the CRC32 hash of bytes read by this reader so far.
+    ///
+    /// This hash should only be computed once EOF has been reached.
+    pub fn compute_hash(&mut self) -> u32 {
+        self.reader.swap_and_compute_hash()
+    }
+
+    /// Consumes this reader and returns the inner value.
+    pub(crate) fn into_inner(self) -> R {
+        self.reader.into_inner().into_inner().into_inner().owned_into_inner()
+    }
+}
+
+impl<R> ZipEntryReader<'_, R, WithEntry<'_>>
+where
+    R: AsyncBufRead + Unpin,
+{
+    /// Returns an immutable reference to the associated entry data.
+    pub fn entry(&self) -> &'_ ZipEntry {
+        self.entry.0.entry()
+    }
+
+    /// Reads all bytes until EOF has been reached, appending them to buf, and verifies the CRC32 values.
+    ///
+    /// This is a helper function synonymous to [`AsyncReadExt::read_to_end()`].
+    pub async fn read_to_end_checked(&mut self, buf: &mut Vec<u8>) -> Result<usize> {
+        let read = self.read_to_end(buf).await?;
+
+        if self.compute_hash() == self.entry.0.entry().crc32() {
+            Ok(read)
+        } else {
+            Err(ZipError::CRC32CheckError)
+        }
+    }
+
+    /// Reads all bytes until EOF has been reached, placing them into buf, and verifies the CRC32 values.
+    ///
+    /// This is a helper function synonymous to [`AsyncReadExt::read_to_string()`].
+    pub async fn read_to_string_checked(&mut self, buf: &mut String) -> Result<usize> {
+        let read = self.read_to_string(buf).await?;
+
+        if self.compute_hash() == self.entry.0.entry().crc32() {
+            Ok(read)
+        } else {
+            Err(ZipError::CRC32CheckError)
+        }
+    }
+}
+
+enum OwnedEntry<'a> {
+    Owned(ZipEntry),
+    Borrow(&'a ZipEntry),
+}
+
+impl<'a> OwnedEntry<'a> {
+    pub fn entry(&self) -> &'_ ZipEntry {
+        match self {
+            OwnedEntry::Owned(entry) => entry,
+            OwnedEntry::Borrow(entry) => entry,
+        }
+    }
+}
diff --git a/crates/async_zip/src/base/read/io/hashed.rs b/crates/async_zip/src/base/read/io/hashed.rs
new file mode 100644
index 0000000..1190f0d
--- /dev/null
+++ b/crates/async_zip/src/base/read/io/hashed.rs
@@ -0,0 +1,56 @@
+// Copyright (c) 2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+use crate::base::read::io::poll_result_ok;
+
+use std::pin::Pin;
+use std::task::{ready, Context, Poll};
+
+use crc32fast::Hasher;
+use futures_lite::io::AsyncRead;
+use pin_project::pin_project;
+
+/// A wrapping reader which computes the CRC32 hash of data read via [`AsyncRead`].
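+///
+/// A usage sketch (illustrative only; this type is crate-internal):
+/// ```ignore
+/// let mut reader = HashedReader::new(&data[..]);
+/// futures_lite::io::copy(&mut reader, &mut futures_lite::io::sink()).await?;
+/// assert_eq!(reader.swap_and_compute_hash(), crc32fast::hash(&data));
+/// ```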
+#[pin_project]
+pub(crate) struct HashedReader<R> {
+    #[pin]
+    pub(crate) reader: R,
+    pub(crate) hasher: Hasher,
+}
+
+impl<R> HashedReader<R>
+where
+    R: AsyncRead + Unpin,
+{
+    /// Constructs a new wrapping reader from a generic [`AsyncRead`] implementer.
+    pub(crate) fn new(reader: R) -> Self {
+        Self { reader, hasher: Hasher::default() }
+    }
+
+    /// Swaps the internal hasher and returns the computed CRC32 hash.
+    ///
+    /// The internal hasher is taken and replaced with a newly-constructed one. As a result, this method should only be
+    /// called once EOF has been reached and it's known that no more data will be read, else the computed hash(s) won't
+    /// accurately represent the data read in.
+    pub(crate) fn swap_and_compute_hash(&mut self) -> u32 {
+        std::mem::take(&mut self.hasher).finalize()
+    }
+
+    /// Consumes this reader and returns the inner value.
+    pub(crate) fn into_inner(self) -> R {
+        self.reader
+    }
+}
+
+impl<R> AsyncRead for HashedReader<R>
+where
+    R: AsyncRead + Unpin,
+{
+    fn poll_read(self: Pin<&mut Self>, c: &mut Context<'_>, b: &mut [u8]) -> Poll<std::io::Result<usize>> {
+        let project = self.project();
+        let written = poll_result_ok!(ready!(project.reader.poll_read(c, b)));
+        project.hasher.update(&b[..written]);
+
+        Poll::Ready(Ok(written))
+    }
+}
diff --git a/crates/async_zip/src/base/read/io/locator.rs b/crates/async_zip/src/base/read/io/locator.rs
new file mode 100644
index 0000000..a2e9c5f
--- /dev/null
+++ b/crates/async_zip/src/base/read/io/locator.rs
@@ -0,0 +1,96 @@
+// Copyright (c) 2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+//!
+//!
+//! As with other ZIP libraries, we face the predicament that the end of central directory record may contain a
+//! variable-length file comment. As a result, we cannot just make the assumption that the start of this record is
+//! 18 bytes (the length of the EOCDR) offset from the end of the data - we must locate it ourselves.
+//!
+//! The `zip-rs` crate handles this by reading in reverse from the end of the data. This involves seeking backwards
+//! by a single byte each iteration and reading 4 bytes into a u32. Whether this is performant/acceptable within
+//! a non-async context, I'm unsure, but it isn't desirable within an async context. Especially since we cannot just
+//! place a [`BufReader`] in front of the upstream reader (as its internal buffer is invalidated on each seek).
+//!
+//! Reading in reverse is still desirable as the use of file comments is limited and they're unlikely to be large.
+//!
+//! The below method is one that compromises on these two contention points. Please submit an issue or PR if you know
+//! of a better algorithm for this (and have tested/verified its performance).
+
+#[cfg(doc)]
+use futures_lite::io::BufReader;
+
+use crate::error::{Result as ZipResult, ZipError};
+use crate::spec::consts::{EOCDR_LENGTH, EOCDR_SIGNATURE, SIGNATURE_LENGTH};
+
+use futures_lite::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt, SeekFrom};
+
+/// The buffer size used when locating the EOCDR, equal to 2KiB.
+const BUFFER_SIZE: usize = 2048;
+
+/// The upper bound of where the EOCDR signature cannot be located.
+const EOCDR_UPPER_BOUND: u64 = EOCDR_LENGTH as u64;
+
+/// The lower bound of where the EOCDR signature cannot be located.
+const EOCDR_LOWER_BOUND: u64 = EOCDR_UPPER_BOUND + SIGNATURE_LENGTH as u64 + u16::MAX as u64;
+
+/// Locate the `end of central directory record` offset, if one exists.
+/// The returned offset excludes the signature (4 bytes)
+///
+/// This method involves buffered reading in reverse and reverse linear searching along those buffers for the EOCDR
+/// signature. As a result of this buffered approach, we reduce seeks when compared to `zip-rs`'s method by a factor
+/// of the buffer size. We also then don't have to do individual u32 reads against the upstream reader.
+///
+/// Whilst I haven't done any in-depth benchmarks, when reading a ZIP file with the maximum length comment, this method
+/// saw a reduction in location time by a factor of 500 when compared with the `zip-rs` method.
+pub async fn eocdr<R>(mut reader: R) -> ZipResult<u64>
+where
+    R: AsyncRead + AsyncSeek + Unpin,
+{
+    let length = reader.seek(SeekFrom::End(0)).await?;
+    let signature = &EOCDR_SIGNATURE.to_le_bytes();
+    let mut buffer: [u8; BUFFER_SIZE] = [0; BUFFER_SIZE];
+
+    let mut position = length.saturating_sub((EOCDR_LENGTH + BUFFER_SIZE) as u64);
+    reader.seek(SeekFrom::Start(position)).await?;
+
+    loop {
+        let read = reader.read(&mut buffer).await?;
+
+        if let Some(match_index) = reverse_search_buffer(&buffer[..read], signature) {
+            return Ok(position + (match_index + 1) as u64);
+        }
+
+        // If we hit the start of the data or the lower bound, we're unable to locate the EOCDR.
+        if position == 0 || position <= length.saturating_sub(EOCDR_LOWER_BOUND) {
+            return Err(ZipError::UnableToLocateEOCDR);
+        }
+
+        // To handle the case where the EOCDR signature crosses buffer boundaries, we simply overlap reads by the
+        // signature length. This significantly reduces the complexity of handling partial matches with very little
+        // overhead.
+        position = position.saturating_sub((BUFFER_SIZE - SIGNATURE_LENGTH) as u64);
+        reader.seek(SeekFrom::Start(position)).await?;
+    }
+}
+
+/// A naive reverse linear search along the buffer for the specified signature bytes.
+///
+/// This is already surprisingly performant. For instance, using memchr::memchr() to match for the first byte of the
+/// signature, and then manual byte comparisons for the remaining signature bytes was actually slower by a factor of
+/// 2.25. This method was explored as tokio's `read_until()` implementation uses memchr::memchr().
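+///
+/// A usage sketch (the EOCDR signature `0x06054b50` is `50 4B 05 06` little-endian); note that the
+/// returned index points at the *last* byte of the match, which is why `eocdr` above adds one:
+/// ```ignore
+/// let buffer = [0x00, 0x50, 0x4b, 0x05, 0x06, 0x00];
+/// assert_eq!(reverse_search_buffer(&buffer, &0x06054b50u32.to_le_bytes()), Some(4));
+/// ```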
+pub(crate) fn reverse_search_buffer(buffer: &[u8], signature: &[u8]) -> Option<usize> {
+    'outer: for index in (0..buffer.len()).rev() {
+        for (signature_index, signature_byte) in signature.iter().rev().enumerate() {
+            if let Some(next_index) = index.checked_sub(signature_index) {
+                if buffer[next_index] != *signature_byte {
+                    continue 'outer;
+                }
+            } else {
+                break 'outer;
+            }
+        }
+        return Some(index);
+    }
+    None
+}
diff --git a/crates/async_zip/src/base/read/io/mod.rs b/crates/async_zip/src/base/read/io/mod.rs
new file mode 100644
index 0000000..86af934
--- /dev/null
+++ b/crates/async_zip/src/base/read/io/mod.rs
@@ -0,0 +1,88 @@
+// Copyright (c) 2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+pub(crate) mod combined_record;
+pub(crate) mod compressed;
+pub(crate) mod entry;
+pub(crate) mod hashed;
+pub(crate) mod locator;
+pub(crate) mod owned;
+
+use std::{
+    future::Future,
+    io::ErrorKind,
+    pin::Pin,
+    task::{ready, Context, Poll},
+};
+
+pub use combined_record::CombinedCentralDirectoryRecord;
+use futures_lite::io::AsyncBufRead;
+use pin_project::pin_project;
+
+use crate::{
+    spec::consts::{DATA_DESCRIPTOR_LENGTH, DATA_DESCRIPTOR_SIGNATURE, SIGNATURE_LENGTH},
+    string::{StringEncoding, ZipString},
+};
+use futures_lite::io::{AsyncRead, AsyncReadExt};
+
+/// Read and return a dynamic length string from a reader which impls AsyncRead.
+pub(crate) async fn read_string<R>(reader: R, length: usize, encoding: StringEncoding) -> std::io::Result<ZipString>
+where
+    R: AsyncRead + Unpin,
+{
+    Ok(ZipString::new(read_bytes(reader, length).await?, encoding))
+}
+
+/// Read and return a dynamic length vector of bytes from a reader which impls AsyncRead.
+pub(crate) async fn read_bytes<R>(reader: R, length: usize) -> std::io::Result<Vec<u8>>
+where
+    R: AsyncRead + Unpin,
+{
+    let mut buffer = Vec::with_capacity(length);
+    reader.take(length as u64).read_to_end(&mut buffer).await?;
+
+    Ok(buffer)
+}
+
+#[pin_project]
+pub(crate) struct ConsumeDataDescriptor<'a, R>(#[pin] pub(crate) &'a mut R);
+
+impl<R> Future for ConsumeDataDescriptor<'_, R>
+where
+    R: AsyncBufRead + Unpin,
+{
+    type Output = std::io::Result<()>;
+
+    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        let mut project = self.project();
+
+        let data = poll_result_ok!(ready!(project.0.as_mut().poll_fill_buf(cx)));
+        let signature = data.get(0..4).ok_or(ErrorKind::UnexpectedEof)?;
+        let mut consumed = DATA_DESCRIPTOR_LENGTH;
+
+        if signature == DATA_DESCRIPTOR_SIGNATURE.to_le_bytes() {
+            consumed += SIGNATURE_LENGTH;
+        }
+        if consumed > data.len() {
+            return Poll::Ready(Err(ErrorKind::UnexpectedEof.into()));
+        }
+
+        project.0.as_mut().consume(consumed);
+        Poll::Ready(Ok(()))
+    }
+}
+
+/// A macro that returns the inner value of an Ok or early-returns in the case of an Err.
+///
+/// This is almost identical to the ? operator but handles the situation when a Result is used in combination with
+/// Poll (eg. tokio's IO traits such as AsyncRead).
+macro_rules! poll_result_ok {
+    ($poll:expr) => {
+        match $poll {
+            Ok(inner) => inner,
+            Err(err) => return Poll::Ready(Err(err)),
+        }
+    };
+}
+
+use poll_result_ok;
diff --git a/crates/async_zip/src/base/read/io/owned.rs b/crates/async_zip/src/base/read/io/owned.rs
new file mode 100644
index 0000000..371ffab
--- /dev/null
+++ b/crates/async_zip/src/base/read/io/owned.rs
@@ -0,0 +1,62 @@
+// Copyright (c) 2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+use std::pin::Pin;
+use std::task::{Context, Poll};
+
+use futures_lite::io::{AsyncBufRead, AsyncRead};
+use pin_project::pin_project;
+
+/// A wrapping reader which holds an owned R or a mutable borrow to R.
+///
+/// This is used to represent whether the supplied reader can be acted on concurrently or not (with an owned value
+/// suggesting that R implements some method of synchronisation & cloning).
+#[pin_project(project = OwnedReaderProj)]
+pub(crate) enum OwnedReader<'a, R> {
+    Owned(#[pin] R),
+    Borrow(#[pin] &'a mut R),
+}
+
+impl<'a, R> OwnedReader<'a, R>
+where
+    R: AsyncBufRead + Unpin,
+{
+    /// Consumes an owned reader and returns the inner value.
+    pub(crate) fn owned_into_inner(self) -> R {
+        match self {
+            OwnedReader::Owned(inner) => inner,
+            OwnedReader::Borrow(_) => panic!("not OwnedReader::Owned value"),
+        }
+    }
+}
+
+impl<'a, R> AsyncBufRead for OwnedReader<'a, R>
+where
+    R: AsyncBufRead + Unpin,
+{
+    fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<std::io::Result<&[u8]>> {
+        match self.project() {
+            OwnedReaderProj::Owned(inner) => inner.poll_fill_buf(cx),
+            OwnedReaderProj::Borrow(inner) => inner.poll_fill_buf(cx),
+        }
+    }
+
+    fn consume(self: Pin<&mut Self>, amt: usize) {
+        match self.project() {
+            OwnedReaderProj::Owned(inner) => inner.consume(amt),
+            OwnedReaderProj::Borrow(inner) => inner.consume(amt),
+        }
+    }
+}
+
+impl<'a, R> AsyncRead for OwnedReader<'a, R>
+where
+    R: AsyncBufRead + Unpin,
+{
+    fn poll_read(self: Pin<&mut Self>, c: &mut Context<'_>, b: &mut [u8]) -> Poll<std::io::Result<usize>> {
+        match self.project() {
+            OwnedReaderProj::Owned(inner) => inner.poll_read(c, b),
+            OwnedReaderProj::Borrow(inner) => inner.poll_read(c, b),
+        }
+    }
+}
diff --git a/crates/async_zip/src/base/read/mem.rs b/crates/async_zip/src/base/read/mem.rs
new file mode 100644
index 0000000..c8fa9f1
--- /dev/null
+++ b/crates/async_zip/src/base/read/mem.rs
@@ -0,0 +1,147 @@
+// Copyright (c) 2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+//! A concurrent ZIP reader which acts over an owned vector of bytes.
+//!
+//! Concurrency is achieved as a result of:
+//! - Wrapping the provided vector of bytes within an [`Arc`] to allow shared ownership.
+//! - Wrapping this [`Arc`] around a [`Cursor`] when reading (as the [`Arc`] can deref and coerce into a `&[u8]`).
+//!
+//! ### Usage
+//! Unlike the [`seek`] module, we no longer hold a mutable reference to any inner reader which in turn, allows the
+//! construction of concurrent [`ZipEntryReader`]s. Though, note that each individual [`ZipEntryReader`] cannot be sent
+//! between thread boundaries due to the masked lifetime requirement. Therefore, the overarching [`ZipFileReader`]
+//! should be cloned and moved into those contexts when needed.
+//!
+//! ### Concurrent Example
+//! ```no_run
+//! # use async_zip::base::read::mem::ZipFileReader;
+//! # use async_zip::error::Result;
+//! # use futures_lite::io::AsyncReadExt;
+//! #
+//! async fn run() -> Result<()> {
let reader = ZipFileReader::new(Vec::new()).await?; +//! let result = tokio::join!(read(&reader, 0), read(&reader, 1)); +//! +//! let data_0 = result.0?; +//! let data_1 = result.1?; +//! +//! // Use data within current scope. +//! +//! Ok(()) +//! } +//! +//! async fn read(reader: &ZipFileReader, index: usize) -> Result> { +//! let mut entry = reader.reader_without_entry(index).await?; +//! let mut data = Vec::new(); +//! entry.read_to_end(&mut data).await?; +//! Ok(data) +//! } +//! ``` +//! +//! ### Parallel Example +//! ```no_run +//! # use async_zip::base::read::mem::ZipFileReader; +//! # use async_zip::error::Result; +//! # use futures_lite::io::AsyncReadExt; +//! # +//! async fn run() -> Result<()> { +//! let reader = ZipFileReader::new(Vec::new()).await?; +//! +//! let handle_0 = tokio::spawn(read(reader.clone(), 0)); +//! let handle_1 = tokio::spawn(read(reader.clone(), 1)); +//! +//! let data_0 = handle_0.await.expect("thread panicked")?; +//! let data_1 = handle_1.await.expect("thread panicked")?; +//! +//! // Use data within current scope. +//! +//! Ok(()) +//! } +//! +//! async fn read(reader: ZipFileReader, index: usize) -> Result> { +//! let mut entry = reader.reader_without_entry(index).await?; +//! let mut data = Vec::new(); +//! entry.read_to_end(&mut data).await?; +//! Ok(data) +//! } +//! ``` + +#[cfg(doc)] +use crate::base::read::seek; + +use crate::base::read::io::entry::ZipEntryReader; +use crate::error::{Result, ZipError}; +use crate::file::ZipFile; + +use std::sync::Arc; + +use futures_lite::io::Cursor; + +use super::io::entry::{WithEntry, WithoutEntry}; + +struct Inner { + data: Vec, + file: ZipFile, +} + +// A concurrent ZIP reader which acts over an owned vector of bytes. +#[derive(Clone)] +pub struct ZipFileReader { + inner: Arc, +} + +impl ZipFileReader { + /// Constructs a new ZIP reader from an owned vector of bytes. + pub async fn new(data: Vec) -> Result { + let file = crate::base::read::file(Cursor::new(&data)).await?; + Ok(ZipFileReader::from_raw_parts(data, file)) + } + + /// Constructs a ZIP reader from an owned vector of bytes and ZIP file information derived from those bytes. + /// + /// Providing a [`ZipFile`] that wasn't derived from those bytes may lead to inaccurate parsing. + pub fn from_raw_parts(data: Vec, file: ZipFile) -> ZipFileReader { + ZipFileReader { inner: Arc::new(Inner { data, file }) } + } + + /// Returns this ZIP file's information. + pub fn file(&self) -> &ZipFile { + &self.inner.file + } + + /// Returns the raw bytes provided to the reader during construction. + pub fn data(&self) -> &[u8] { + &self.inner.data + } + + /// Returns a new entry reader if the provided index is valid. + pub async fn reader_without_entry(&self, index: usize) -> Result, WithoutEntry>> { + let stored_entry = self.inner.file.entries.get(index).ok_or(ZipError::EntryIndexOutOfBounds)?; + let mut cursor = Cursor::new(&self.inner.data[..]); + + stored_entry.seek_to_data_offset(&mut cursor).await?; + + Ok(ZipEntryReader::new_with_owned( + cursor, + stored_entry.entry.compression(), + stored_entry.entry.compressed_size(), + )) + } + + /// Returns a new entry reader if the provided index is valid. 
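+    ///
+    /// A minimal usage sketch (illustrative addition, not part of the upstream crate;
+    /// it assumes an already constructed `reader` and that entry `0` exists):
+    ///
+    /// ```no_run
+    /// # use async_zip::base::read::mem::ZipFileReader;
+    /// # use async_zip::error::Result;
+    /// # use futures_lite::io::AsyncReadExt;
+    /// # async fn example(reader: ZipFileReader) -> Result<()> {
+    /// let mut entry = reader.reader_with_entry(0).await?;
+    /// println!("name: {}", entry.entry().filename().as_str().unwrap());
+    /// let mut data = Vec::new();
+    /// entry.read_to_end(&mut data).await?;
+    /// # Ok(())
+    /// # }
+    /// ```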
+    pub async fn reader_with_entry(&self, index: usize) -> Result<ZipEntryReader<'_, Cursor<&[u8]>, WithEntry<'_>>> {
+        let stored_entry = self.inner.file.entries.get(index).ok_or(ZipError::EntryIndexOutOfBounds)?;
+        let mut cursor = Cursor::new(&self.inner.data[..]);
+
+        stored_entry.seek_to_data_offset(&mut cursor).await?;
+
+        let reader = ZipEntryReader::new_with_owned(
+            cursor,
+            stored_entry.entry.compression(),
+            stored_entry.entry.compressed_size(),
+        );
+
+        Ok(reader.into_with_entry(stored_entry))
+    }
+}
diff --git a/crates/async_zip/src/base/read/mod.rs b/crates/async_zip/src/base/read/mod.rs
new file mode 100644
index 0000000..e07cd16
--- /dev/null
+++ b/crates/async_zip/src/base/read/mod.rs
@@ -0,0 +1,320 @@
+// Copyright (c) 2022-2023 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+//! A module which supports reading ZIP files.
+
+pub mod mem;
+pub mod seek;
+pub mod stream;
+
+pub(crate) mod io;
+
+use crate::ZipString;
+// Re-exported as part of the public API.
+pub use crate::base::read::io::entry::WithEntry;
+pub use crate::base::read::io::entry::WithoutEntry;
+pub use crate::base::read::io::entry::ZipEntryReader;
+
+use crate::date::ZipDateTime;
+use crate::entry::{StoredZipEntry, ZipEntry};
+use crate::error::{Result, ZipError};
+use crate::file::ZipFile;
+use crate::spec::attribute::AttributeCompatibility;
+use crate::spec::consts::LFH_LENGTH;
+use crate::spec::consts::{CDH_SIGNATURE, LFH_SIGNATURE, NON_ZIP64_MAX_SIZE, SIGNATURE_LENGTH, ZIP64_EOCDL_LENGTH};
+use crate::spec::header::InfoZipUnicodeCommentExtraField;
+use crate::spec::header::InfoZipUnicodePathExtraField;
+use crate::spec::header::{
+    CentralDirectoryRecord, EndOfCentralDirectoryHeader, ExtraField, LocalFileHeader,
+    Zip64EndOfCentralDirectoryLocator, Zip64EndOfCentralDirectoryRecord, Zip64ExtendedInformationExtraField,
+};
+use crate::spec::Compression;
+use crate::string::StringEncoding;
+
+use crate::base::read::io::CombinedCentralDirectoryRecord;
+use crate::spec::parse::parse_extra_fields;
+
+use futures_lite::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt, SeekFrom};
+
+pub(crate) async fn file<R>(mut reader: R) -> Result<ZipFile>
+where
+    R: AsyncRead + AsyncSeek + Unpin,
+{
+    // First find and parse the EOCDR.
+    let eocdr_offset = crate::base::read::io::locator::eocdr(&mut reader).await?;
+
+    reader.seek(SeekFrom::Start(eocdr_offset)).await?;
+    let eocdr = EndOfCentralDirectoryHeader::from_reader(&mut reader).await?;
+
+    let comment = io::read_string(&mut reader, eocdr.file_comm_length.into(), crate::StringEncoding::Utf8).await?;
+
+    // Check the 20 bytes before the EOCDR for the Zip64 EOCDL, plus an extra 4 bytes because the offset
+    // does not include the signature. If the EOCDL exists, we are dealing with a Zip64 file.
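+    // Illustrative tail layout of a zip64 archive (comment added for clarity, not from
+    // the upstream source):
+    //   ... [zip64 EOCDR] [zip64 EOCDL: 20 bytes] [EOCDR: 4-byte signature + fixed fields + comment]
+    // `eocdr_offset` points just past the EOCDR signature, hence SIGNATURE_LENGTH is
+    // subtracted on top of ZIP64_EOCDL_LENGTH to land on a possible EOCDL below.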
+ let (eocdr, zip64) = match eocdr_offset.checked_sub(ZIP64_EOCDL_LENGTH + SIGNATURE_LENGTH as u64) { + None => (CombinedCentralDirectoryRecord::from(&eocdr), false), + Some(offset) => { + reader.seek(SeekFrom::Start(offset)).await?; + let zip64_locator = Zip64EndOfCentralDirectoryLocator::try_from_reader(&mut reader).await?; + + match zip64_locator { + Some(locator) => { + reader.seek(SeekFrom::Start(locator.relative_offset + SIGNATURE_LENGTH as u64)).await?; + let zip64_eocdr = Zip64EndOfCentralDirectoryRecord::from_reader(&mut reader).await?; + (CombinedCentralDirectoryRecord::combine(eocdr, zip64_eocdr), true) + } + None => (CombinedCentralDirectoryRecord::from(&eocdr), false), + } + } + }; + + // Outdated feature so unlikely to ever make it into this crate. + if eocdr.disk_number != eocdr.disk_number_start_of_cd + || eocdr.num_entries_in_directory != eocdr.num_entries_in_directory_on_disk + { + return Err(ZipError::FeatureNotSupported("Spanned/split files")); + } + + // Find and parse the central directory. + reader.seek(SeekFrom::Start(eocdr.offset_of_start_of_directory)).await?; + let entries = crate::base::read::cd(reader, eocdr.num_entries_in_directory, zip64).await?; + + Ok(ZipFile { entries, comment, zip64 }) +} + +pub(crate) async fn cd(mut reader: R, num_of_entries: u64, zip64: bool) -> Result> +where + R: AsyncRead + Unpin, +{ + let num_of_entries = num_of_entries.try_into().map_err(|_| ZipError::TargetZip64NotSupported)?; + let mut entries = Vec::with_capacity(num_of_entries); + + for _ in 0..num_of_entries { + let entry = cd_record(&mut reader, zip64).await?; + entries.push(entry); + } + + Ok(entries) +} + +pub(crate) fn get_zip64_extra_field(extra_fields: &[ExtraField]) -> Option<&Zip64ExtendedInformationExtraField> { + for field in extra_fields { + if let ExtraField::Zip64ExtendedInformation(zip64field) = field { + return Some(zip64field); + } + } + None +} + +pub(crate) fn get_zip64_extra_field_mut( + extra_fields: &mut [ExtraField], +) -> Option<&mut Zip64ExtendedInformationExtraField> { + for field in extra_fields { + if let ExtraField::Zip64ExtendedInformation(zip64field) = field { + return Some(zip64field); + } + } + None +} + +fn get_combined_sizes( + uncompressed_size: u32, + compressed_size: u32, + extra_field: &Option<&Zip64ExtendedInformationExtraField>, +) -> Result<(u64, u64)> { + let mut uncompressed_size = uncompressed_size as u64; + let mut compressed_size = compressed_size as u64; + + if let Some(extra_field) = extra_field { + if let Some(s) = extra_field.uncompressed_size { + uncompressed_size = s; + } + if let Some(s) = extra_field.compressed_size { + compressed_size = s; + } + } + + Ok((uncompressed_size, compressed_size)) +} + +pub(crate) async fn cd_record(mut reader: R, _zip64: bool) -> Result +where + R: AsyncRead + Unpin, +{ + crate::utils::assert_signature(&mut reader, CDH_SIGNATURE).await?; + + let header = CentralDirectoryRecord::from_reader(&mut reader).await?; + let header_size = (SIGNATURE_LENGTH + LFH_LENGTH) as u64; + let trailing_size = header.file_name_length as u64 + header.extra_field_length as u64; + let filename_basic = io::read_bytes(&mut reader, header.file_name_length.into()).await?; + let compression = Compression::try_from(header.compression)?; + let extra_field = io::read_bytes(&mut reader, header.extra_field_length.into()).await?; + let extra_fields = parse_extra_fields(extra_field, header.uncompressed_size, header.compressed_size)?; + let comment_basic = io::read_bytes(reader, header.file_comment_length.into()).await?; + + let 
zip64_extra_field = get_zip64_extra_field(&extra_fields); + let (uncompressed_size, compressed_size) = + get_combined_sizes(header.uncompressed_size, header.compressed_size, &zip64_extra_field)?; + + let mut file_offset = header.lh_offset as u64; + if let Some(zip64_extra_field) = zip64_extra_field { + if file_offset == NON_ZIP64_MAX_SIZE as u64 { + if let Some(offset) = zip64_extra_field.relative_header_offset { + file_offset = offset; + } + } + } + + let filename = detect_filename(filename_basic, header.flags.filename_unicode, extra_fields.as_ref()); + let comment = detect_comment(comment_basic, header.flags.filename_unicode, extra_fields.as_ref()); + + let entry = ZipEntry { + filename, + compression, + #[cfg(any( + feature = "deflate", + feature = "bzip2", + feature = "zstd", + feature = "lzma", + feature = "xz", + feature = "deflate64" + ))] + compression_level: async_compression::Level::Default, + attribute_compatibility: AttributeCompatibility::Unix, + // FIXME: Default to Unix for the moment + crc32: header.crc, + uncompressed_size, + compressed_size, + last_modification_date: ZipDateTime { date: header.mod_date, time: header.mod_time }, + internal_file_attribute: header.inter_attr, + external_file_attribute: header.exter_attr, + extra_fields, + comment, + data_descriptor: header.flags.data_descriptor, + }; + + Ok(StoredZipEntry { entry, file_offset, header_size: header_size + trailing_size }) +} + +pub(crate) async fn lfh(mut reader: R) -> Result> +where + R: AsyncRead + Unpin, +{ + let signature = { + let mut buffer = [0; 4]; + reader.read_exact(&mut buffer).await?; + u32::from_le_bytes(buffer) + }; + match signature { + actual if actual == LFH_SIGNATURE => (), + actual if actual == CDH_SIGNATURE => return Ok(None), + actual => return Err(ZipError::UnexpectedHeaderError(actual, LFH_SIGNATURE)), + }; + + let header = LocalFileHeader::from_reader(&mut reader).await?; + let filename_basic = io::read_bytes(&mut reader, header.file_name_length.into()).await?; + let compression = Compression::try_from(header.compression)?; + let extra_field = io::read_bytes(&mut reader, header.extra_field_length.into()).await?; + let extra_fields = parse_extra_fields(extra_field, header.uncompressed_size, header.compressed_size)?; + + let zip64_extra_field = get_zip64_extra_field(&extra_fields); + let (uncompressed_size, compressed_size) = + get_combined_sizes(header.uncompressed_size, header.compressed_size, &zip64_extra_field)?; + + if header.flags.data_descriptor && compression == Compression::Stored { + return Err(ZipError::FeatureNotSupported( + "stream reading entries with data descriptors & Stored compression mode", + )); + } + if header.flags.encrypted { + return Err(ZipError::FeatureNotSupported("encryption")); + } + + let filename = detect_filename(filename_basic, header.flags.filename_unicode, extra_fields.as_ref()); + + let entry = ZipEntry { + filename, + compression, + #[cfg(any( + feature = "deflate", + feature = "bzip2", + feature = "zstd", + feature = "lzma", + feature = "xz", + feature = "deflate64" + ))] + compression_level: async_compression::Level::Default, + attribute_compatibility: AttributeCompatibility::Unix, + // FIXME: Default to Unix for the moment + crc32: header.crc, + uncompressed_size, + compressed_size, + last_modification_date: ZipDateTime { date: header.mod_date, time: header.mod_time }, + internal_file_attribute: 0, + external_file_attribute: 0, + extra_fields, + comment: String::new().into(), + data_descriptor: header.flags.data_descriptor, + }; + + Ok(Some(entry)) 
+}
+
+fn detect_comment(basic: Vec<u8>, basic_is_utf8: bool, extra_fields: &[ExtraField]) -> ZipString {
+    if basic_is_utf8 {
+        ZipString::new(basic, StringEncoding::Utf8)
+    } else {
+        let unicode_extra = extra_fields.iter().find_map(|field| match field {
+            ExtraField::InfoZipUnicodeComment(InfoZipUnicodeCommentExtraField::V1 { crc32, unicode }) => {
+                if *crc32 == crc32fast::hash(&basic) {
+                    Some(std::string::String::from_utf8(unicode.clone()))
+                } else {
+                    None
+                }
+            }
+            _ => None,
+        });
+        if let Some(Ok(s)) = unicode_extra {
+            ZipString::new_with_alternative(s, basic)
+        } else {
+            // Do not treat the bytes as UTF-8 when the UTF-8 flag is unset: some MBCS strings
+            // happen to be well-formed UTF-8 without actually being UTF-8.
+            if basic.is_ascii() {
+                // SAFETY:
+                // a valid ASCII string is always a valid UTF-8 string
+                unsafe { std::string::String::from_utf8_unchecked(basic).into() }
+            } else {
+                ZipString::new(basic, StringEncoding::Raw)
+            }
+        }
+    }
+}
+
+fn detect_filename(basic: Vec<u8>, basic_is_utf8: bool, extra_fields: &[ExtraField]) -> ZipString {
+    if basic_is_utf8 {
+        ZipString::new(basic, StringEncoding::Utf8)
+    } else {
+        let unicode_extra = extra_fields.iter().find_map(|field| match field {
+            ExtraField::InfoZipUnicodePath(InfoZipUnicodePathExtraField::V1 { crc32, unicode }) => {
+                if *crc32 == crc32fast::hash(&basic) {
+                    Some(std::string::String::from_utf8(unicode.clone()))
+                } else {
+                    None
+                }
+            }
+            _ => None,
+        });
+        if let Some(Ok(s)) = unicode_extra {
+            ZipString::new_with_alternative(s, basic)
+        } else {
+            // Do not treat the bytes as UTF-8 when the UTF-8 flag is unset: some MBCS strings
+            // happen to be well-formed UTF-8 without actually being UTF-8.
+            if basic.is_ascii() {
+                // SAFETY:
+                // a valid ASCII string is always a valid UTF-8 string
+                unsafe { std::string::String::from_utf8_unchecked(basic).into() }
+            } else {
+                ZipString::new(basic, StringEncoding::Raw)
+            }
+        }
+    }
+}
diff --git a/crates/async_zip/src/base/read/seek.rs b/crates/async_zip/src/base/read/seek.rs
new file mode 100644
index 0000000..bd1f1ab
--- /dev/null
+++ b/crates/async_zip/src/base/read/seek.rs
@@ -0,0 +1,140 @@
+// Copyright (c) 2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+//! A ZIP reader which acts over a seekable source.
+//!
+//! ### Example
+//! ```no_run
+//! # use async_zip::base::read::seek::ZipFileReader;
+//! # use async_zip::error::Result;
+//! # use futures_lite::io::AsyncReadExt;
+//! # use tokio::fs::File;
+//! # use tokio_util::compat::TokioAsyncReadCompatExt;
+//! # use tokio::io::BufReader;
+//! #
+//! async fn run() -> Result<()> {
+//!     let mut data = BufReader::new(File::open("./foo.zip").await?);
+//!     let mut reader = ZipFileReader::new(data.compat()).await?;
+//!
+//!     let mut data = Vec::new();
+//!     let mut entry = reader.reader_without_entry(0).await?;
+//!     entry.read_to_end(&mut data).await?;
+//!
+//!     // Use data within current scope.
+//!
+//!     Ok(())
+//! }
+//! ```
+
+use crate::base::read::io::entry::ZipEntryReader;
+use crate::error::{Result, ZipError};
+use crate::file::ZipFile;
+
+#[cfg(feature = "tokio")]
+use crate::tokio::read::seek::ZipFileReader as TokioZipFileReader;
+
+use futures_lite::io::{AsyncBufRead, AsyncSeek};
+
+#[cfg(feature = "tokio")]
+use tokio_util::compat::{Compat, TokioAsyncReadCompatExt};
+
+use super::io::entry::{WithEntry, WithoutEntry};
+
+/// A ZIP reader which acts over a seekable source.
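+///
+/// Note (editorial addition): the derived [`Clone`] implementation is only available when
+/// the underlying reader `R` is itself [`Clone`]; cloning duplicates the parsed [`ZipFile`]
+/// information alongside the reader.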
+#[derive(Clone)] +pub struct ZipFileReader { + reader: R, + file: ZipFile, +} + +impl ZipFileReader +where + R: AsyncBufRead + AsyncSeek + Unpin, +{ + /// Constructs a new ZIP reader from a seekable source. + pub async fn new(mut reader: R) -> Result> { + let file = crate::base::read::file(&mut reader).await?; + Ok(ZipFileReader::from_raw_parts(reader, file)) + } + + /// Constructs a ZIP reader from a seekable source and ZIP file information derived from that source. + /// + /// Providing a [`ZipFile`] that wasn't derived from that source may lead to inaccurate parsing. + pub fn from_raw_parts(reader: R, file: ZipFile) -> ZipFileReader { + ZipFileReader { reader, file } + } + + /// Returns this ZIP file's information. + pub fn file(&self) -> &ZipFile { + &self.file + } + + /// Returns a mutable reference to the inner seekable source. + /// + /// Swapping the source (eg. via std::mem operations) may lead to inaccurate parsing. + pub fn inner_mut(&mut self) -> &mut R { + &mut self.reader + } + + /// Returns the inner seekable source by consuming self. + pub fn into_inner(self) -> R { + self.reader + } + + /// Returns a new entry reader if the provided index is valid. + pub async fn reader_without_entry(&mut self, index: usize) -> Result> { + let stored_entry = self.file.entries.get(index).ok_or(ZipError::EntryIndexOutOfBounds)?; + stored_entry.seek_to_data_offset(&mut self.reader).await?; + + Ok(ZipEntryReader::new_with_borrow( + &mut self.reader, + stored_entry.entry.compression(), + stored_entry.entry.compressed_size(), + )) + } + + /// Returns a new entry reader if the provided index is valid. + pub async fn reader_with_entry(&mut self, index: usize) -> Result>> { + let stored_entry = self.file.entries.get(index).ok_or(ZipError::EntryIndexOutOfBounds)?; + + stored_entry.seek_to_data_offset(&mut self.reader).await?; + + let reader = ZipEntryReader::new_with_borrow( + &mut self.reader, + stored_entry.entry.compression(), + stored_entry.entry.compressed_size(), + ); + + Ok(reader.into_with_entry(stored_entry)) + } + + /// Returns a new entry reader if the provided index is valid. + /// Consumes self + pub async fn into_entry<'a>(mut self, index: usize) -> Result> + where + R: 'a, + { + let stored_entry = self.file.entries.get(index).ok_or(ZipError::EntryIndexOutOfBounds)?; + + stored_entry.seek_to_data_offset(&mut self.reader).await?; + + Ok(ZipEntryReader::new_with_owned( + self.reader, + stored_entry.entry.compression(), + stored_entry.entry.compressed_size(), + )) + } +} + +#[cfg(feature = "tokio")] +impl ZipFileReader> +where + R: tokio::io::AsyncBufRead + tokio::io::AsyncSeek + Unpin, +{ + /// Constructs a new tokio-specific ZIP reader from a seekable source. + pub async fn with_tokio(reader: R) -> Result> { + let mut reader = reader.compat(); + let file = crate::base::read::file(&mut reader).await?; + Ok(ZipFileReader::from_raw_parts(reader, file)) + } +} diff --git a/crates/async_zip/src/base/read/stream.rs b/crates/async_zip/src/base/read/stream.rs new file mode 100644 index 0000000..d276941 --- /dev/null +++ b/crates/async_zip/src/base/read/stream.rs @@ -0,0 +1,174 @@ +// Copyright (c) 2023 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +//! A ZIP reader which acts over a non-seekable source. +//! +//! # API Design +//! As opposed to other readers provided by this crate, it's important that the data of an entry is fully read before +//! the proceeding entry is read. 
the next entry is read. This is a consequence of not being able to seek forwards or backwards, so we must end
+//! up at the start of the next entry.
+//!
+//! **We encode this invariant within Rust's type system so that it can be enforced at compile time.**
+//!
+//! This requires that any transition methods between these encoded types consume the reader and provide a new owned
+//! reader back. This is certainly something to keep in mind when working with this reader, but idiomatic code can
+//! still be produced nevertheless.
+//!
+//! # Considerations
+//! As the central directory of a ZIP archive is stored at its end, a non-seekable reader doesn't have access
+//! to it. We have to rely on information provided within the local file header, which may not be accurate or complete.
+//! This results in:
+//! - The inability to read ZIP entries using the combination of a data descriptor and the Stored compression method.
+//! - No file comment being available (defaults to an empty string).
+//! - No internal or external file attributes being available (defaults to 0).
+//! - The extra field data potentially being inconsistent with what's stored in the central directory.
+//! - None of the following being available when the entry was written with a data descriptor (defaults to 0):
+//!     - CRC
+//!     - compressed size
+//!     - uncompressed size
+//!
+//! # Example
+//! ```no_run
+//! # use futures_lite::io::Cursor;
+//! # use async_zip::error::Result;
+//! # use async_zip::base::read::stream::ZipFileReader;
+//! #
+//! # async fn run() -> Result<()> {
+//! let mut zip = ZipFileReader::new(Cursor::new([0; 0]));
+//!
+//! // Print the name of every file in a ZIP archive.
+//! while let Some(entry) = zip.next_with_entry().await? {
+//!     println!("File: {}", entry.reader().entry().filename().as_str().unwrap());
+//!     zip = entry.skip().await?;
+//! }
+//! #
+//! # Ok(())
+//! # }
+//! ```
+
+use super::io::ConsumeDataDescriptor;
+
+use crate::base::read::io::entry::ZipEntryReader;
+use crate::error::Result;
+use crate::error::ZipError;
+
+#[cfg(feature = "tokio")]
+use crate::tokio::read::stream::Ready as TokioReady;
+
+use futures_lite::io::AsyncBufRead;
+use futures_lite::io::AsyncReadExt;
+
+#[cfg(feature = "tokio")]
+use tokio_util::compat::TokioAsyncReadCompatExt;
+
+use super::io::entry::WithEntry;
+use super::io::entry::WithoutEntry;
+
+/// A type which encodes that [`ZipFileReader`] is ready to open a new entry.
+pub struct Ready<R>(R);
+
+/// A type which encodes that [`ZipFileReader`] is currently reading an entry.
+pub struct Reading<'a, R, E>(ZipEntryReader<'a, R, E>, bool);
+
+/// A ZIP reader which acts over a non-seekable source.
+///
+/// See the [module-level docs](.) for more information.
+#[derive(Clone)]
+pub struct ZipFileReader<S>(S);
+
+impl<'a, R> ZipFileReader<Ready<R>>
+where
+    R: AsyncBufRead + Unpin + 'a,
+{
+    /// Constructs a new ZIP reader from a non-seekable source.
+    pub fn new(reader: R) -> Self {
+        Self(Ready(reader))
+    }
+
+    /// Opens the next entry for reading if the central directory hasn't yet been reached.
+    pub async fn next_without_entry(mut self) -> Result<Option<ZipFileReader<Reading<'a, R, WithoutEntry>>>> {
+        let entry = match crate::base::read::lfh(&mut self.0 .0).await?
{ + Some(entry) => entry, + None => return Ok(None), + }; + + let length = if entry.data_descriptor { u64::MAX } else { entry.compressed_size }; + let reader = ZipEntryReader::new_with_owned(self.0 .0, entry.compression, length); + + Ok(Some(ZipFileReader(Reading(reader, entry.data_descriptor)))) + } + + /// Opens the next entry for reading if the central directory hasn’t yet been reached. + pub async fn next_with_entry(mut self) -> Result>>>> { + let entry = match crate::base::read::lfh(&mut self.0 .0).await? { + Some(entry) => entry, + None => return Ok(None), + }; + + let length = if entry.data_descriptor { u64::MAX } else { entry.compressed_size }; + let reader = ZipEntryReader::new_with_owned(self.0 .0, entry.compression, length); + let data_descriptor = entry.data_descriptor; + + Ok(Some(ZipFileReader(Reading(reader.into_with_entry_owned(entry), data_descriptor)))) + } + + /// Consumes the `ZipFileReader` returning the original `reader` + pub async fn into_inner(self) -> R { + self.0 .0 + } +} + +#[cfg(feature = "tokio")] +impl ZipFileReader> +where + R: tokio::io::AsyncBufRead + Unpin, +{ + /// Constructs a new tokio-specific ZIP reader from a non-seekable source. + pub fn with_tokio(reader: R) -> ZipFileReader> { + Self(Ready(reader.compat())) + } +} + +impl<'a, R, E> ZipFileReader> +where + R: AsyncBufRead + Unpin, +{ + /// Returns an immutable reference to the inner entry reader. + pub fn reader(&self) -> &ZipEntryReader<'a, R, E> { + &self.0 .0 + } + + /// Returns a mutable reference to the inner entry reader. + pub fn reader_mut(&mut self) -> &mut ZipEntryReader<'a, R, E> { + &mut self.0 .0 + } + + /// Converts the reader back into the Ready state if EOF has been reached. + pub async fn done(mut self) -> Result>> { + if self.0 .0.read(&mut [0; 1]).await? != 0 { + return Err(ZipError::EOFNotReached); + } + + let mut inner = self.0 .0.into_inner(); + + // Has data descriptor. + if self.0 .1 { + ConsumeDataDescriptor(&mut inner).await?; + } + + Ok(ZipFileReader(Ready(inner))) + } + + /// Reads until EOF and converts the reader back into the Ready state. + pub async fn skip(mut self) -> Result>> { + while self.0 .0.read(&mut [0; 2048]).await? != 0 {} + let mut inner = self.0 .0.into_inner(); + + // Has data descriptor. 
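+        // (Clarifying comment, editorial addition: a data descriptor trails the entry
+        // data with its CRC and sizes, so it must be consumed here to leave the reader
+        // positioned at the next local file header.)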
+ if self.0 .1 { + ConsumeDataDescriptor(&mut inner).await?; + } + + Ok(ZipFileReader(Ready(inner))) + } +} diff --git a/crates/async_zip/src/base/write/compressed_writer.rs b/crates/async_zip/src/base/write/compressed_writer.rs new file mode 100644 index 0000000..3b71421 --- /dev/null +++ b/crates/async_zip/src/base/write/compressed_writer.rs @@ -0,0 +1,137 @@ +// Copyright (c) 2021 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +use crate::base::write::io::offset::AsyncOffsetWriter; +use crate::spec::Compression; + +use std::io::Error; +use std::pin::Pin; +use std::task::{Context, Poll}; + +#[cfg(any(feature = "deflate", feature = "bzip2", feature = "zstd", feature = "lzma", feature = "xz"))] +use async_compression::futures::write; +use futures_lite::io::AsyncWrite; + +pub enum CompressedAsyncWriter<'b, W: AsyncWrite + Unpin> { + Stored(ShutdownIgnoredWriter<&'b mut AsyncOffsetWriter>), + #[cfg(feature = "deflate")] + Deflate(write::DeflateEncoder>>), + #[cfg(feature = "bzip2")] + Bz(write::BzEncoder>>), + #[cfg(feature = "lzma")] + Lzma(write::LzmaEncoder>>), + #[cfg(feature = "zstd")] + Zstd(write::ZstdEncoder>>), + #[cfg(feature = "xz")] + Xz(write::XzEncoder>>), +} + +impl<'b, W: AsyncWrite + Unpin> CompressedAsyncWriter<'b, W> { + pub fn from_raw(writer: &'b mut AsyncOffsetWriter, compression: Compression) -> Self { + match compression { + Compression::Stored => CompressedAsyncWriter::Stored(ShutdownIgnoredWriter(writer)), + #[cfg(feature = "deflate")] + Compression::Deflate => { + CompressedAsyncWriter::Deflate(write::DeflateEncoder::new(ShutdownIgnoredWriter(writer))) + } + #[cfg(feature = "deflate64")] + Compression::Deflate64 => panic!("writing deflate64 is not supported"), + #[cfg(feature = "bzip2")] + Compression::Bz => CompressedAsyncWriter::Bz(write::BzEncoder::new(ShutdownIgnoredWriter(writer))), + #[cfg(feature = "lzma")] + Compression::Lzma => CompressedAsyncWriter::Lzma(write::LzmaEncoder::new(ShutdownIgnoredWriter(writer))), + #[cfg(feature = "zstd")] + Compression::Zstd => CompressedAsyncWriter::Zstd(write::ZstdEncoder::new(ShutdownIgnoredWriter(writer))), + #[cfg(feature = "xz")] + Compression::Xz => CompressedAsyncWriter::Xz(write::XzEncoder::new(ShutdownIgnoredWriter(writer))), + } + } + + pub fn into_inner(self) -> &'b mut AsyncOffsetWriter { + match self { + CompressedAsyncWriter::Stored(inner) => inner.into_inner(), + #[cfg(feature = "deflate")] + CompressedAsyncWriter::Deflate(inner) => inner.into_inner().into_inner(), + #[cfg(feature = "bzip2")] + CompressedAsyncWriter::Bz(inner) => inner.into_inner().into_inner(), + #[cfg(feature = "lzma")] + CompressedAsyncWriter::Lzma(inner) => inner.into_inner().into_inner(), + #[cfg(feature = "zstd")] + CompressedAsyncWriter::Zstd(inner) => inner.into_inner().into_inner(), + #[cfg(feature = "xz")] + CompressedAsyncWriter::Xz(inner) => inner.into_inner().into_inner(), + } + } +} + +impl<'b, W: AsyncWrite + Unpin> AsyncWrite for CompressedAsyncWriter<'b, W> { + fn poll_write(mut self: Pin<&mut Self>, cx: &mut Context, buf: &[u8]) -> Poll> { + match *self { + CompressedAsyncWriter::Stored(ref mut inner) => Pin::new(inner).poll_write(cx, buf), + #[cfg(feature = "deflate")] + CompressedAsyncWriter::Deflate(ref mut inner) => Pin::new(inner).poll_write(cx, buf), + #[cfg(feature = "bzip2")] + CompressedAsyncWriter::Bz(ref mut inner) => Pin::new(inner).poll_write(cx, buf), + #[cfg(feature = "lzma")] + CompressedAsyncWriter::Lzma(ref mut inner) => 
Pin::new(inner).poll_write(cx, buf), + #[cfg(feature = "zstd")] + CompressedAsyncWriter::Zstd(ref mut inner) => Pin::new(inner).poll_write(cx, buf), + #[cfg(feature = "xz")] + CompressedAsyncWriter::Xz(ref mut inner) => Pin::new(inner).poll_write(cx, buf), + } + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll> { + match *self { + CompressedAsyncWriter::Stored(ref mut inner) => Pin::new(inner).poll_flush(cx), + #[cfg(feature = "deflate")] + CompressedAsyncWriter::Deflate(ref mut inner) => Pin::new(inner).poll_flush(cx), + #[cfg(feature = "bzip2")] + CompressedAsyncWriter::Bz(ref mut inner) => Pin::new(inner).poll_flush(cx), + #[cfg(feature = "lzma")] + CompressedAsyncWriter::Lzma(ref mut inner) => Pin::new(inner).poll_flush(cx), + #[cfg(feature = "zstd")] + CompressedAsyncWriter::Zstd(ref mut inner) => Pin::new(inner).poll_flush(cx), + #[cfg(feature = "xz")] + CompressedAsyncWriter::Xz(ref mut inner) => Pin::new(inner).poll_flush(cx), + } + } + + fn poll_close(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll> { + match *self { + CompressedAsyncWriter::Stored(ref mut inner) => Pin::new(inner).poll_close(cx), + #[cfg(feature = "deflate")] + CompressedAsyncWriter::Deflate(ref mut inner) => Pin::new(inner).poll_close(cx), + #[cfg(feature = "bzip2")] + CompressedAsyncWriter::Bz(ref mut inner) => Pin::new(inner).poll_close(cx), + #[cfg(feature = "lzma")] + CompressedAsyncWriter::Lzma(ref mut inner) => Pin::new(inner).poll_close(cx), + #[cfg(feature = "zstd")] + CompressedAsyncWriter::Zstd(ref mut inner) => Pin::new(inner).poll_close(cx), + #[cfg(feature = "xz")] + CompressedAsyncWriter::Xz(ref mut inner) => Pin::new(inner).poll_close(cx), + } + } +} + +pub struct ShutdownIgnoredWriter(W); + +impl ShutdownIgnoredWriter { + pub fn into_inner(self) -> W { + self.0 + } +} + +impl AsyncWrite for ShutdownIgnoredWriter { + fn poll_write(mut self: Pin<&mut Self>, cx: &mut Context, buf: &[u8]) -> Poll> { + Pin::new(&mut self.0).poll_write(cx, buf) + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll> { + Pin::new(&mut self.0).poll_flush(cx) + } + + fn poll_close(self: Pin<&mut Self>, _: &mut Context) -> Poll> { + Poll::Ready(Ok(())) + } +} diff --git a/crates/async_zip/src/base/write/entry_stream.rs b/crates/async_zip/src/base/write/entry_stream.rs new file mode 100644 index 0000000..cc41f0e --- /dev/null +++ b/crates/async_zip/src/base/write/entry_stream.rs @@ -0,0 +1,272 @@ +// Copyright (c) 2021 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +use crate::base::write::compressed_writer::CompressedAsyncWriter; +use crate::base::write::get_or_put_info_zip_unicode_comment_extra_field_mut; +use crate::base::write::get_or_put_info_zip_unicode_path_extra_field_mut; +use crate::base::write::io::offset::AsyncOffsetWriter; +use crate::base::write::CentralDirectoryEntry; +use crate::base::write::ZipFileWriter; +use crate::entry::ZipEntry; +use crate::error::{Result, Zip64ErrorCase, ZipError}; +use crate::spec::extra_field::ExtraFieldAsBytes; +use crate::spec::header::InfoZipUnicodeCommentExtraField; +use crate::spec::header::InfoZipUnicodePathExtraField; +use crate::spec::header::{ + CentralDirectoryRecord, ExtraField, GeneralPurposeFlag, HeaderId, LocalFileHeader, + Zip64ExtendedInformationExtraField, +}; +use crate::string::StringEncoding; + +use std::io::Error; +use std::pin::Pin; +use std::task::{Context, Poll}; + +use crate::base::read::get_zip64_extra_field_mut; +use 
crate::spec::consts::{NON_ZIP64_MAX_NUM_FILES, NON_ZIP64_MAX_SIZE};
+use crc32fast::Hasher;
+use futures_lite::io::{AsyncWrite, AsyncWriteExt};
+
+/// An entry writer which supports the streaming of data (ie. writing data of unknown size at runtime).
+///
+/// # Note
+/// - This writer cannot be manually constructed; instead, use [`ZipFileWriter::write_entry_stream()`].
+/// - [`EntryStreamWriter::close()`] must be called before a stream writer goes out of scope.
+/// - Utilities for working with [`AsyncWrite`] values are provided by [`AsyncWriteExt`].
+pub struct EntryStreamWriter<'b, W: AsyncWrite + Unpin> {
+    writer: AsyncOffsetWriter<CompressedAsyncWriter<'b, W>>,
+    cd_entries: &'b mut Vec<CentralDirectoryEntry>,
+    entry: ZipEntry,
+    hasher: Hasher,
+    lfh: LocalFileHeader,
+    lfh_offset: u64,
+    data_offset: u64,
+    force_no_zip64: bool,
+    /// To write back to the original writer if zip64 is required.
+    is_zip64: &'b mut bool,
+}
+
+impl<'b, W: AsyncWrite + Unpin> EntryStreamWriter<'b, W> {
+    pub(crate) async fn from_raw(
+        writer: &'b mut ZipFileWriter<W>,
+        mut entry: ZipEntry,
+    ) -> Result<EntryStreamWriter<'b, W>> {
+        let lfh_offset = writer.writer.offset();
+        let lfh = EntryStreamWriter::write_lfh(writer, &mut entry).await?;
+        let data_offset = writer.writer.offset();
+        let force_no_zip64 = writer.force_no_zip64;
+
+        let cd_entries = &mut writer.cd_entries;
+        let is_zip64 = &mut writer.is_zip64;
+        let writer = AsyncOffsetWriter::new(CompressedAsyncWriter::from_raw(&mut writer.writer, entry.compression()));
+
+        Ok(EntryStreamWriter {
+            writer,
+            cd_entries,
+            entry,
+            lfh,
+            lfh_offset,
+            data_offset,
+            hasher: Hasher::new(),
+            force_no_zip64,
+            is_zip64,
+        })
+    }
+
+    async fn write_lfh(writer: &'b mut ZipFileWriter<W>, entry: &mut ZipEntry) -> Result<LocalFileHeader> {
+        // Always emit a zip64 extended field, even if we don't need it, because we *might* need it.
+        // If we are forcing no zip64, we will have to error later if the file is too large.
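+        // NON_ZIP64_MAX_SIZE is the 0xFFFF_FFFF sentinel: readers that find it in the
+        // 32-bit size fields fall back to the zip64 extended information field instead.
+        // (Clarifying comment, editorial addition.)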
+ let (lfh_compressed, lfh_uncompressed) = if !writer.force_no_zip64 { + if !writer.is_zip64 { + writer.is_zip64 = true; + } + entry.extra_fields.push(ExtraField::Zip64ExtendedInformation(Zip64ExtendedInformationExtraField { + header_id: HeaderId::ZIP64_EXTENDED_INFORMATION_EXTRA_FIELD, + uncompressed_size: Some(entry.uncompressed_size), + compressed_size: Some(entry.compressed_size), + relative_header_offset: None, + disk_start_number: None, + })); + + (NON_ZIP64_MAX_SIZE, NON_ZIP64_MAX_SIZE) + } else { + if entry.compressed_size > NON_ZIP64_MAX_SIZE as u64 || entry.uncompressed_size > NON_ZIP64_MAX_SIZE as u64 + { + return Err(ZipError::Zip64Needed(Zip64ErrorCase::LargeFile)); + } + + (entry.compressed_size as u32, entry.uncompressed_size as u32) + }; + + let utf8_without_alternative = + entry.filename().is_utf8_without_alternative() && entry.comment().is_utf8_without_alternative(); + if !utf8_without_alternative { + if matches!(entry.filename().encoding(), StringEncoding::Utf8) { + let u_file_name = entry.filename().as_bytes().to_vec(); + if !u_file_name.is_empty() { + let basic_crc32 = + crc32fast::hash(entry.filename().alternative().unwrap_or_else(|| entry.filename().as_bytes())); + let upath_field = get_or_put_info_zip_unicode_path_extra_field_mut(entry.extra_fields.as_mut()); + if let InfoZipUnicodePathExtraField::V1 { crc32, unicode } = upath_field { + *crc32 = basic_crc32; + *unicode = u_file_name; + } + } + } + if matches!(entry.comment().encoding(), StringEncoding::Utf8) { + let u_comment = entry.comment().as_bytes().to_vec(); + if !u_comment.is_empty() { + let basic_crc32 = + crc32fast::hash(entry.comment().alternative().unwrap_or_else(|| entry.comment().as_bytes())); + let ucom_field = get_or_put_info_zip_unicode_comment_extra_field_mut(entry.extra_fields.as_mut()); + if let InfoZipUnicodeCommentExtraField::V1 { crc32, unicode } = ucom_field { + *crc32 = basic_crc32; + *unicode = u_comment; + } + } + } + } + + let filename_basic = entry.filename().alternative().unwrap_or_else(|| entry.filename().as_bytes()); + + let lfh = LocalFileHeader { + compressed_size: lfh_compressed, + uncompressed_size: lfh_uncompressed, + compression: entry.compression().into(), + crc: entry.crc32, + extra_field_length: entry + .extra_fields() + .count_bytes() + .try_into() + .map_err(|_| ZipError::ExtraFieldTooLarge)?, + file_name_length: filename_basic.len().try_into().map_err(|_| ZipError::FileNameTooLarge)?, + mod_time: entry.last_modification_date().time, + mod_date: entry.last_modification_date().date, + version: crate::spec::version::as_needed_to_extract(entry), + flags: GeneralPurposeFlag { + data_descriptor: true, + encrypted: false, + filename_unicode: utf8_without_alternative, + }, + }; + + writer.writer.write_all(&crate::spec::consts::LFH_SIGNATURE.to_le_bytes()).await?; + writer.writer.write_all(&lfh.as_slice()).await?; + writer.writer.write_all(filename_basic).await?; + writer.writer.write_all(&entry.extra_fields().as_bytes()).await?; + + Ok(lfh) + } + + /// Consumes this entry writer and completes all closing tasks. + /// + /// This includes: + /// - Finalising the CRC32 hash value for the written data. + /// - Calculating the compressed and uncompressed byte sizes. + /// - Constructing a central directory header. + /// - Pushing that central directory header to the [`ZipFileWriter`]'s store. + /// + /// Failure to call this function before going out of scope would result in a corrupted ZIP file. 
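+    ///
+    /// A short sketch of the intended call pattern (illustrative addition, not from the
+    /// upstream crate; `Compression::Stored` is used to avoid optional features):
+    ///
+    /// ```no_run
+    /// # use async_zip::{Compression, ZipEntryBuilder, base::write::ZipFileWriter};
+    /// # use futures_lite::io::AsyncWriteExt;
+    /// # async fn run() -> Result<(), async_zip::error::ZipError> {
+    /// let mut writer = ZipFileWriter::new(Vec::<u8>::new());
+    /// let opts = ZipEntryBuilder::new(String::from("foo.txt").into(), Compression::Stored);
+    /// let mut entry_writer = writer.write_entry_stream(opts).await?;
+    /// entry_writer.write_all(b"example data").await.unwrap();
+    /// entry_writer.close().await?; // finalises the CRC, sizes, and data descriptor
+    /// # Ok(())
+    /// # }
+    /// ```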
+ pub async fn close(mut self) -> Result<()> { + self.writer.close().await?; + + let crc = self.hasher.finalize(); + let uncompressed_size = self.writer.offset(); + let inner_writer = self.writer.into_inner().into_inner(); + let compressed_size = inner_writer.offset() - self.data_offset; + + let (cdr_compressed_size, cdr_uncompressed_size, lh_offset) = if self.force_no_zip64 { + if uncompressed_size > NON_ZIP64_MAX_SIZE as u64 + || compressed_size > NON_ZIP64_MAX_SIZE as u64 + || self.lfh_offset > NON_ZIP64_MAX_SIZE as u64 + { + return Err(ZipError::Zip64Needed(Zip64ErrorCase::LargeFile)); + } + (uncompressed_size as u32, compressed_size as u32, self.lfh_offset as u32) + } else { + // When streaming an entry, we are always using a zip64 field. + match get_zip64_extra_field_mut(&mut self.entry.extra_fields) { + // This case shouldn't be necessary but is included for completeness. + None => { + self.entry.extra_fields.push(ExtraField::Zip64ExtendedInformation( + Zip64ExtendedInformationExtraField { + header_id: HeaderId::ZIP64_EXTENDED_INFORMATION_EXTRA_FIELD, + uncompressed_size: Some(uncompressed_size), + compressed_size: Some(compressed_size), + relative_header_offset: Some(self.lfh_offset), + disk_start_number: None, + }, + )); + } + Some(zip64) => { + zip64.uncompressed_size = Some(uncompressed_size); + zip64.compressed_size = Some(compressed_size); + zip64.relative_header_offset = Some(self.lfh_offset); + } + } + self.lfh.extra_field_length = + self.entry.extra_fields().count_bytes().try_into().map_err(|_| ZipError::ExtraFieldTooLarge)?; + + (NON_ZIP64_MAX_SIZE, NON_ZIP64_MAX_SIZE, NON_ZIP64_MAX_SIZE) + }; + + inner_writer.write_all(&crate::spec::consts::DATA_DESCRIPTOR_SIGNATURE.to_le_bytes()).await?; + inner_writer.write_all(&crc.to_le_bytes()).await?; + inner_writer.write_all(&cdr_compressed_size.to_le_bytes()).await?; + inner_writer.write_all(&cdr_uncompressed_size.to_le_bytes()).await?; + + let comment_basic = self.entry.comment().alternative().unwrap_or_else(|| self.entry.comment().as_bytes()); + + let cdh = CentralDirectoryRecord { + compressed_size: cdr_compressed_size, + uncompressed_size: cdr_uncompressed_size, + crc, + v_made_by: crate::spec::version::as_made_by(), + v_needed: self.lfh.version, + compression: self.lfh.compression, + extra_field_length: self.lfh.extra_field_length, + file_name_length: self.lfh.file_name_length, + file_comment_length: comment_basic.len().try_into().map_err(|_| ZipError::CommentTooLarge)?, + mod_time: self.lfh.mod_time, + mod_date: self.lfh.mod_date, + flags: self.lfh.flags, + disk_start: 0, + inter_attr: self.entry.internal_file_attribute(), + exter_attr: self.entry.external_file_attribute(), + lh_offset, + }; + + self.cd_entries.push(CentralDirectoryEntry { header: cdh, entry: self.entry }); + // Ensure that we can fit this many files in this archive if forcing no zip64 + if self.cd_entries.len() > NON_ZIP64_MAX_NUM_FILES as usize { + if self.force_no_zip64 { + return Err(ZipError::Zip64Needed(Zip64ErrorCase::TooManyFiles)); + } + if !*self.is_zip64 { + *self.is_zip64 = true; + } + } + + Ok(()) + } +} + +impl<'a, W: AsyncWrite + Unpin> AsyncWrite for EntryStreamWriter<'a, W> { + fn poll_write(mut self: Pin<&mut Self>, cx: &mut Context, buf: &[u8]) -> Poll> { + let poll = Pin::new(&mut self.writer).poll_write(cx, buf); + + if let Poll::Ready(Ok(written)) = poll { + self.hasher.update(&buf[0..written]); + } + + poll + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll> { + Pin::new(&mut self.writer).poll_flush(cx) + } + + 
fn poll_close(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll> { + Pin::new(&mut self.writer).poll_close(cx) + } +} diff --git a/crates/async_zip/src/base/write/entry_whole.rs b/crates/async_zip/src/base/write/entry_whole.rs new file mode 100644 index 0000000..34594b6 --- /dev/null +++ b/crates/async_zip/src/base/write/entry_whole.rs @@ -0,0 +1,259 @@ +// Copyright (c) 2021 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +use crate::base::write::get_or_put_info_zip_unicode_comment_extra_field_mut; +use crate::base::write::get_or_put_info_zip_unicode_path_extra_field_mut; +use crate::base::write::{CentralDirectoryEntry, ZipFileWriter}; +use crate::entry::ZipEntry; +use crate::error::{Result, Zip64ErrorCase, ZipError}; +use crate::spec::extra_field::Zip64ExtendedInformationExtraFieldBuilder; +use crate::spec::header::{InfoZipUnicodeCommentExtraField, InfoZipUnicodePathExtraField}; +use crate::spec::{ + extra_field::ExtraFieldAsBytes, + header::{CentralDirectoryRecord, ExtraField, GeneralPurposeFlag, LocalFileHeader}, + Compression, +}; +use crate::StringEncoding; +#[cfg(any(feature = "deflate", feature = "bzip2", feature = "zstd", feature = "lzma", feature = "xz"))] +use futures_lite::io::Cursor; + +use crate::spec::consts::{NON_ZIP64_MAX_NUM_FILES, NON_ZIP64_MAX_SIZE}; +#[cfg(any(feature = "deflate", feature = "bzip2", feature = "zstd", feature = "lzma", feature = "xz"))] +use async_compression::futures::write; +use futures_lite::io::{AsyncWrite, AsyncWriteExt}; + +pub struct EntryWholeWriter<'b, 'c, W: AsyncWrite + Unpin> { + writer: &'b mut ZipFileWriter, + entry: ZipEntry, + data: &'c [u8], +} + +impl<'b, 'c, W: AsyncWrite + Unpin> EntryWholeWriter<'b, 'c, W> { + pub fn from_raw(writer: &'b mut ZipFileWriter, entry: ZipEntry, data: &'c [u8]) -> Self { + Self { writer, entry, data } + } + + pub async fn write(mut self) -> Result<()> { + let mut _compressed_data: Option> = None; + let compressed_data = match self.entry.compression() { + Compression::Stored => self.data, + #[cfg(any( + feature = "deflate", + feature = "bzip2", + feature = "zstd", + feature = "lzma", + feature = "xz", + feature = "deflate64" + ))] + _ => { + _compressed_data = + Some(compress(self.entry.compression(), self.data, self.entry.compression_level).await); + _compressed_data.as_ref().unwrap() + } + }; + + let mut zip64_extra_field_builder = None; + + let (lfh_uncompressed_size, lfh_compressed_size) = if self.data.len() as u64 > NON_ZIP64_MAX_SIZE as u64 + || compressed_data.len() as u64 > NON_ZIP64_MAX_SIZE as u64 + { + if self.writer.force_no_zip64 { + return Err(ZipError::Zip64Needed(Zip64ErrorCase::LargeFile)); + } + if !self.writer.is_zip64 { + self.writer.is_zip64 = true; + } + zip64_extra_field_builder = Some( + Zip64ExtendedInformationExtraFieldBuilder::new() + .sizes(compressed_data.len() as u64, self.data.len() as u64), + ); + (NON_ZIP64_MAX_SIZE, NON_ZIP64_MAX_SIZE) + } else { + (self.data.len() as u32, compressed_data.len() as u32) + }; + + let lh_offset = if self.writer.writer.offset() > NON_ZIP64_MAX_SIZE as u64 { + if self.writer.force_no_zip64 { + return Err(ZipError::Zip64Needed(Zip64ErrorCase::LargeFile)); + } + if !self.writer.is_zip64 { + self.writer.is_zip64 = true; + } + + if let Some(zip64_extra_field) = zip64_extra_field_builder { + zip64_extra_field_builder = Some(zip64_extra_field.relative_header_offset(self.writer.writer.offset())); + } else { + zip64_extra_field_builder = Some( + 
Zip64ExtendedInformationExtraFieldBuilder::new() + .relative_header_offset(self.writer.writer.offset()), + ); + } + NON_ZIP64_MAX_SIZE + } else { + self.writer.writer.offset() as u32 + }; + + if let Some(builder) = zip64_extra_field_builder { + if !builder.eof_only() { + self.entry.extra_fields.push(ExtraField::Zip64ExtendedInformation(builder.build()?)); + zip64_extra_field_builder = None; + } else { + zip64_extra_field_builder = Some(builder); + } + } + + let utf8_without_alternative = + self.entry.filename().is_utf8_without_alternative() && self.entry.comment().is_utf8_without_alternative(); + if !utf8_without_alternative { + if matches!(self.entry.filename().encoding(), StringEncoding::Utf8) { + let u_file_name = self.entry.filename().as_bytes().to_vec(); + if !u_file_name.is_empty() { + let basic_crc32 = crc32fast::hash( + self.entry.filename().alternative().unwrap_or_else(|| self.entry.filename().as_bytes()), + ); + let upath_field = + get_or_put_info_zip_unicode_path_extra_field_mut(self.entry.extra_fields.as_mut()); + if let InfoZipUnicodePathExtraField::V1 { crc32, unicode } = upath_field { + *crc32 = basic_crc32; + *unicode = u_file_name; + } + } + } + if matches!(self.entry.comment().encoding(), StringEncoding::Utf8) { + let u_comment = self.entry.comment().as_bytes().to_vec(); + if !u_comment.is_empty() { + let basic_crc32 = crc32fast::hash( + self.entry.comment().alternative().unwrap_or_else(|| self.entry.comment().as_bytes()), + ); + let ucom_field = + get_or_put_info_zip_unicode_comment_extra_field_mut(self.entry.extra_fields.as_mut()); + if let InfoZipUnicodeCommentExtraField::V1 { crc32, unicode } = ucom_field { + *crc32 = basic_crc32; + *unicode = u_comment; + } + } + } + } + + let filename_basic = self.entry.filename().alternative().unwrap_or_else(|| self.entry.filename().as_bytes()); + let comment_basic = self.entry.comment().alternative().unwrap_or_else(|| self.entry.comment().as_bytes()); + + let lf_header = LocalFileHeader { + compressed_size: lfh_compressed_size, + uncompressed_size: lfh_uncompressed_size, + compression: self.entry.compression().into(), + crc: crc32fast::hash(self.data), + extra_field_length: self + .entry + .extra_fields() + .count_bytes() + .try_into() + .map_err(|_| ZipError::ExtraFieldTooLarge)?, + file_name_length: filename_basic.len().try_into().map_err(|_| ZipError::FileNameTooLarge)?, + mod_time: self.entry.last_modification_date().time, + mod_date: self.entry.last_modification_date().date, + version: crate::spec::version::as_needed_to_extract(&self.entry), + flags: GeneralPurposeFlag { + data_descriptor: false, + encrypted: false, + filename_unicode: utf8_without_alternative, + }, + }; + + let mut header = CentralDirectoryRecord { + v_made_by: crate::spec::version::as_made_by(), + v_needed: lf_header.version, + compressed_size: lf_header.compressed_size, + uncompressed_size: lf_header.uncompressed_size, + compression: lf_header.compression, + crc: lf_header.crc, + extra_field_length: lf_header.extra_field_length, + file_name_length: lf_header.file_name_length, + file_comment_length: comment_basic.len().try_into().map_err(|_| ZipError::CommentTooLarge)?, + mod_time: lf_header.mod_time, + mod_date: lf_header.mod_date, + flags: lf_header.flags, + disk_start: 0, + inter_attr: self.entry.internal_file_attribute(), + exter_attr: self.entry.external_file_attribute(), + lh_offset, + }; + + self.writer.writer.write_all(&crate::spec::consts::LFH_SIGNATURE.to_le_bytes()).await?; + self.writer.writer.write_all(&lf_header.as_slice()).await?; + 
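+        // On-disk order after the LFH signature and fixed fields: file name,
+        // extra fields, then the (already compressed) entry data. (Clarifying
+        // comment, editorial addition.)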
self.writer.writer.write_all(filename_basic).await?; + self.writer.writer.write_all(&self.entry.extra_fields().as_bytes()).await?; + self.writer.writer.write_all(compressed_data).await?; + + if let Some(builder) = zip64_extra_field_builder { + self.entry.extra_fields.push(ExtraField::Zip64ExtendedInformation(builder.build()?)); + header.extra_field_length = + self.entry.extra_fields().count_bytes().try_into().map_err(|_| ZipError::ExtraFieldTooLarge)?; + } + + self.writer.cd_entries.push(CentralDirectoryEntry { header, entry: self.entry }); + // Ensure that we can fit this many files in this archive if forcing no zip64 + if self.writer.cd_entries.len() > NON_ZIP64_MAX_NUM_FILES as usize { + if self.writer.force_no_zip64 { + return Err(ZipError::Zip64Needed(Zip64ErrorCase::TooManyFiles)); + } + if !self.writer.is_zip64 { + self.writer.is_zip64 = true; + } + } + Ok(()) + } +} + +#[cfg(any( + feature = "deflate", + feature = "bzip2", + feature = "zstd", + feature = "lzma", + feature = "xz", + feature = "deflate64" +))] +async fn compress(compression: Compression, data: &[u8], level: async_compression::Level) -> Vec { + // TODO: Reduce reallocations of Vec by making a lower-bound estimate of the length reduction and + // pre-initialising the Vec to that length. Then truncate() to the actual number of bytes written. + match compression { + #[cfg(feature = "deflate")] + Compression::Deflate => { + let mut writer = write::DeflateEncoder::with_quality(Cursor::new(Vec::new()), level); + writer.write_all(data).await.unwrap(); + writer.close().await.unwrap(); + writer.into_inner().into_inner() + } + #[cfg(feature = "deflate64")] + Compression::Deflate64 => panic!("compressing deflate64 is not supported"), + #[cfg(feature = "bzip2")] + Compression::Bz => { + let mut writer = write::BzEncoder::with_quality(Cursor::new(Vec::new()), level); + writer.write_all(data).await.unwrap(); + writer.close().await.unwrap(); + writer.into_inner().into_inner() + } + #[cfg(feature = "lzma")] + Compression::Lzma => { + let mut writer = write::LzmaEncoder::with_quality(Cursor::new(Vec::new()), level); + writer.write_all(data).await.unwrap(); + writer.close().await.unwrap(); + writer.into_inner().into_inner() + } + #[cfg(feature = "xz")] + Compression::Xz => { + let mut writer = write::XzEncoder::with_quality(Cursor::new(Vec::new()), level); + writer.write_all(data).await.unwrap(); + writer.close().await.unwrap(); + writer.into_inner().into_inner() + } + #[cfg(feature = "zstd")] + Compression::Zstd => { + let mut writer = write::ZstdEncoder::with_quality(Cursor::new(Vec::new()), level); + writer.write_all(data).await.unwrap(); + writer.close().await.unwrap(); + writer.into_inner().into_inner() + } + _ => unreachable!(), + } +} diff --git a/crates/async_zip/src/base/write/io/mod.rs b/crates/async_zip/src/base/write/io/mod.rs new file mode 100644 index 0000000..326d7d9 --- /dev/null +++ b/crates/async_zip/src/base/write/io/mod.rs @@ -0,0 +1,4 @@ +// Copyright (c) 2022 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +pub(crate) mod offset; diff --git a/crates/async_zip/src/base/write/io/offset.rs b/crates/async_zip/src/base/write/io/offset.rs new file mode 100644 index 0000000..98d3777 --- /dev/null +++ b/crates/async_zip/src/base/write/io/offset.rs @@ -0,0 +1,73 @@ +// Copyright (c) 2022 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +use std::io::{Error, IoSlice}; +use std::pin::Pin; +use 
std::task::{Context, Poll};
+
+use futures_lite::io::AsyncWrite;
+use pin_project::pin_project;
+
+/// A wrapper around an [`AsyncWrite`] implementation which tracks the current byte offset.
+#[pin_project(project = OffsetWriterProj)]
+pub struct AsyncOffsetWriter<W> {
+    #[pin]
+    inner: W,
+    offset: u64,
+}
+
+impl<W> AsyncOffsetWriter<W>
+where
+    W: AsyncWrite + Unpin,
+{
+    /// Constructs a new wrapper from an inner [`AsyncWrite`] writer.
+    pub fn new(inner: W) -> Self {
+        Self { inner, offset: 0 }
+    }
+
+    /// Returns the current byte offset.
+    pub fn offset(&self) -> u64 {
+        self.offset
+    }
+
+    /// Consumes this wrapper and returns the inner [`AsyncWrite`] writer.
+    pub fn into_inner(self) -> W {
+        self.inner
+    }
+
+    /// Returns a mutable reference to the inner [`AsyncWrite`] writer.
+    pub fn inner_mut(&mut self) -> &mut W {
+        &mut self.inner
+    }
+}
+
+impl<W> AsyncWrite for AsyncOffsetWriter<W>
+where
+    W: AsyncWrite + Unpin,
+{
+    fn poll_write(self: Pin<&mut Self>, cx: &mut Context, buf: &[u8]) -> Poll<Result<usize, Error>> {
+        let this = self.project();
+        let poll = this.inner.poll_write(cx, buf);
+
+        if let Poll::Ready(Ok(inner)) = &poll {
+            *this.offset += *inner as u64;
+        }
+
+        poll
+    }
+
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context) -> Poll<Result<(), Error>> {
+        self.project().inner.poll_flush(cx)
+    }
+
+    fn poll_close(self: Pin<&mut Self>, cx: &mut Context) -> Poll<Result<(), Error>> {
+        self.project().inner.poll_close(cx)
+    }
+
+    fn poll_write_vectored(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        bufs: &[IoSlice<'_>],
+    ) -> Poll<Result<usize, Error>> {
+        self.project().inner.poll_write_vectored(cx, bufs)
+    }
+}
diff --git a/crates/async_zip/src/base/write/mod.rs b/crates/async_zip/src/base/write/mod.rs
new file mode 100644
index 0000000..a571d61
--- /dev/null
+++ b/crates/async_zip/src/base/write/mod.rs
@@ -0,0 +1,290 @@
+// Copyright (c) 2021-2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+//! A module which supports writing ZIP files.
+//!
+//! # Example
+//! ### Whole data (u8 slice)
+//! ```no_run
+//! # #[cfg(feature = "deflate")]
+//! # {
+//! # use async_zip::{Compression, ZipEntryBuilder, base::write::ZipFileWriter};
+//! # use async_zip::error::ZipError;
+//! #
+//! # async fn run() -> Result<(), ZipError> {
+//! let mut writer = ZipFileWriter::new(Vec::<u8>::new());
+//!
+//! let data = b"This is an example file.";
+//! let opts = ZipEntryBuilder::new(String::from("foo.txt").into(), Compression::Deflate);
+//!
+//! writer.write_entry_whole(opts, data).await?;
+//! writer.close().await?;
+//! # Ok(())
+//! # }
+//! # }
+//! ```
+//! ### Stream data (unknown size & data)
+//! ```no_run
+//! # #[cfg(feature = "deflate")]
+//! # {
+//! # use async_zip::{Compression, ZipEntryBuilder, base::write::ZipFileWriter};
+//! # use std::io::Cursor;
+//! # use async_zip::error::ZipError;
+//! # use futures_lite::io::AsyncWriteExt;
+//! # use tokio_util::compat::TokioAsyncWriteCompatExt;
+//! #
+//! # async fn run() -> Result<(), ZipError> {
+//! let mut writer = ZipFileWriter::new(Vec::<u8>::new());
+//!
+//! let data = b"This is an example file.";
+//! let opts = ZipEntryBuilder::new(String::from("bar.txt").into(), Compression::Deflate);
+//!
+//! let mut entry_writer = writer.write_entry_stream(opts).await?;
+//! entry_writer.write_all(data).await.unwrap();
+//!
+//! entry_writer.close().await?;
+//! writer.close().await?;
+//! # Ok(())
+//! # }
+//! # }
+//! ```
+
+pub(crate) mod compressed_writer;
+pub(crate) mod entry_stream;
+pub(crate) mod entry_whole;
+pub(crate) mod io;
+
+pub use entry_stream::EntryStreamWriter;
+
+#[cfg(feature = "tokio")]
+use tokio_util::compat::{Compat, TokioAsyncWriteCompatExt};
+
+use crate::entry::ZipEntry;
+use crate::error::Result;
+use crate::spec::extra_field::ExtraFieldAsBytes;
+use crate::spec::header::{
+    CentralDirectoryRecord, EndOfCentralDirectoryHeader, ExtraField, InfoZipUnicodeCommentExtraField,
+    InfoZipUnicodePathExtraField, Zip64EndOfCentralDirectoryLocator, Zip64EndOfCentralDirectoryRecord,
+};
+
+#[cfg(feature = "tokio")]
+use crate::tokio::write::ZipFileWriter as TokioZipFileWriter;
+
+use entry_whole::EntryWholeWriter;
+use io::offset::AsyncOffsetWriter;
+
+use crate::spec::consts::{NON_ZIP64_MAX_NUM_FILES, NON_ZIP64_MAX_SIZE};
+use futures_lite::io::{AsyncWrite, AsyncWriteExt};
+
+pub(crate) struct CentralDirectoryEntry {
+    pub header: CentralDirectoryRecord,
+    pub entry: ZipEntry,
+}
+
+/// A ZIP file writer which acts over [`AsyncWrite`] implementers.
+///
+/// # Note
+/// - [`ZipFileWriter::close()`] must be called before the writer goes out of scope.
+pub struct ZipFileWriter<W> {
+    pub(crate) writer: AsyncOffsetWriter<W>,
+    pub(crate) cd_entries: Vec<CentralDirectoryEntry>,
+    /// If true, will error if a Zip64 struct must be written.
+    force_no_zip64: bool,
+    /// Whether to write Zip64 end of directory structs.
+    pub(crate) is_zip64: bool,
+    comment_opt: Option<String>,
+}
+
+impl<W: AsyncWrite + Unpin> ZipFileWriter<W> {
+    /// Construct a new ZIP file writer which takes ownership of the given writer.
+    pub fn new(writer: W) -> Self {
+        Self {
+            writer: AsyncOffsetWriter::new(writer),
+            cd_entries: Vec::new(),
+            comment_opt: None,
+            is_zip64: false,
+            force_no_zip64: false,
+        }
+    }
+
+    /// Force the ZIP writer to operate in non-ZIP64 mode.
+    /// If any files would need ZIP64, an error will be raised.
+    pub fn force_no_zip64(mut self) -> Self {
+        self.force_no_zip64 = true;
+        self
+    }
+
+    /// Force the ZIP writer to emit Zip64 structs at the end of the archive.
+    /// Zip64 extended fields will only be written if needed.
+    pub fn force_zip64(mut self) -> Self {
+        self.is_zip64 = true;
+        self
+    }
+
+    /// Write a new ZIP entry of known size and data.
+    pub async fn write_entry_whole<E: Into<ZipEntry>>(&mut self, entry: E, data: &[u8]) -> Result<()> {
+        EntryWholeWriter::from_raw(self, entry.into(), data).write().await
+    }
+
+    /// Write an entry of unknown size via streaming (ie. using a data descriptor).
+    /// The generated Local File Header will carry no compressed size, no uncompressed size,
+    /// and a null CRC, which might cause problems for the destination reader.
+    pub async fn write_entry_stream<E: Into<ZipEntry>>(&mut self, entry: E) -> Result<EntryStreamWriter<'_, W>> {
+        EntryStreamWriter::from_raw(self, entry.into()).await
+    }
+
+    /// Set the ZIP file comment.
+    pub fn comment(&mut self, comment: String) {
+        self.comment_opt = Some(comment);
+    }
+
+    /// Returns a mutable reference to the inner writer.
+    ///
+    /// Care should be taken when using this inner writer as doing so may invalidate internal state of this writer.
+    pub fn inner_mut(&mut self) -> &mut W {
+        self.writer.inner_mut()
+    }
+
+    /// Consumes this ZIP writer and completes all closing tasks.
+    ///
+    /// This includes:
+    /// - Writing all central directory headers.
+    /// - Writing the end of central directory header.
+    /// - Writing the file comment.
+    ///
+    /// Failure to call this function before going out of scope would result in a corrupted ZIP file.
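+    ///
+    /// A minimal sketch, assuming the writer wraps a `Vec<u8>` (illustrative addition,
+    /// not from the upstream crate):
+    ///
+    /// ```no_run
+    /// # use async_zip::base::write::ZipFileWriter;
+    /// # async fn run() -> Result<(), async_zip::error::ZipError> {
+    /// let writer = ZipFileWriter::new(Vec::<u8>::new());
+    /// // ... write entries here ...
+    /// let bytes: Vec<u8> = writer.close().await?;
+    /// # Ok(())
+    /// # }
+    /// ```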
+ pub async fn close(mut self) -> Result { + let cd_offset = self.writer.offset(); + + for entry in &self.cd_entries { + let filename_basic = + entry.entry.filename().alternative().unwrap_or_else(|| entry.entry.filename().as_bytes()); + let comment_basic = entry.entry.comment().alternative().unwrap_or_else(|| entry.entry.comment().as_bytes()); + + self.writer.write_all(&crate::spec::consts::CDH_SIGNATURE.to_le_bytes()).await?; + self.writer.write_all(&entry.header.as_slice()).await?; + self.writer.write_all(filename_basic).await?; + self.writer.write_all(&entry.entry.extra_fields().as_bytes()).await?; + self.writer.write_all(comment_basic).await?; + } + + let central_directory_size = self.writer.offset() - cd_offset; + let central_directory_size_u32 = if central_directory_size > NON_ZIP64_MAX_SIZE as u64 { + NON_ZIP64_MAX_SIZE + } else { + central_directory_size as u32 + }; + let num_entries_in_directory = self.cd_entries.len() as u64; + let num_entries_in_directory_u16 = if num_entries_in_directory > NON_ZIP64_MAX_NUM_FILES as u64 { + NON_ZIP64_MAX_NUM_FILES + } else { + num_entries_in_directory as u16 + }; + let cd_offset_u32 = if cd_offset > NON_ZIP64_MAX_SIZE as u64 { + if self.force_no_zip64 { + return Err(crate::error::ZipError::Zip64Needed(crate::error::Zip64ErrorCase::LargeFile)); + } else { + self.is_zip64 = true; + } + NON_ZIP64_MAX_SIZE + } else { + cd_offset as u32 + }; + + // Add the zip64 EOCDR and EOCDL if we are in zip64 mode. + if self.is_zip64 { + let eocdr_offset = self.writer.offset(); + + let eocdr = Zip64EndOfCentralDirectoryRecord { + size_of_zip64_end_of_cd_record: 44, + version_made_by: crate::spec::version::as_made_by(), + version_needed_to_extract: 46, + disk_number: 0, + disk_number_start_of_cd: 0, + num_entries_in_directory_on_disk: num_entries_in_directory, + num_entries_in_directory, + directory_size: central_directory_size, + offset_of_start_of_directory: cd_offset, + }; + self.writer.write_all(&crate::spec::consts::ZIP64_EOCDR_SIGNATURE.to_le_bytes()).await?; + self.writer.write_all(&eocdr.as_bytes()).await?; + + let eocdl = Zip64EndOfCentralDirectoryLocator { + number_of_disk_with_start_of_zip64_end_of_central_directory: 0, + relative_offset: eocdr_offset, + total_number_of_disks: 1, + }; + self.writer.write_all(&crate::spec::consts::ZIP64_EOCDL_SIGNATURE.to_le_bytes()).await?; + self.writer.write_all(&eocdl.as_bytes()).await?; + } + + let header = EndOfCentralDirectoryHeader { + disk_num: 0, + start_cent_dir_disk: 0, + num_of_entries_disk: num_entries_in_directory_u16, + num_of_entries: num_entries_in_directory_u16, + size_cent_dir: central_directory_size_u32, + cent_dir_offset: cd_offset_u32, + file_comm_length: self.comment_opt.as_ref().map(|v| v.len() as u16).unwrap_or_default(), + }; + + self.writer.write_all(&crate::spec::consts::EOCDR_SIGNATURE.to_le_bytes()).await?; + self.writer.write_all(&header.as_slice()).await?; + if let Some(comment) = self.comment_opt { + self.writer.write_all(comment.as_bytes()).await?; + } + + Ok(self.writer.into_inner()) + } +} + +#[cfg(feature = "tokio")] +impl ZipFileWriter> +where + W: tokio::io::AsyncWrite + Unpin, +{ + /// Construct a new ZIP file writer from a mutable reference to a writer. 
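Review note: the sentinel handling in `close()` above is easy to misread, so here is a standalone sketch of the rule (the names are illustrative, not crate API): once a value exceeds its non-zip64 field width, the header stores all ones and readers are expected to consult the zip64 records instead.

```rust
// Clamp a 64-bit offset/size into a 32-bit header field, using the
// all-ones sentinel (NON_ZIP64_MAX_SIZE) to mean "see the zip64 record".
fn clamp_to_u32(value: u64) -> u32 {
    if value > u32::MAX as u64 {
        u32::MAX
    } else {
        value as u32
    }
}

fn main() {
    assert_eq!(clamp_to_u32(1024), 1024);
    // A central directory starting past 4 GiB no longer fits in the EOCDR field.
    assert_eq!(clamp_to_u32(5 * 1024 * 1024 * 1024), u32::MAX);
}
```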
+ pub fn with_tokio(writer: W) -> TokioZipFileWriter { + Self { + writer: AsyncOffsetWriter::new(writer.compat_write()), + cd_entries: Vec::new(), + comment_opt: None, + is_zip64: false, + force_no_zip64: false, + } + } +} + +pub(crate) fn get_or_put_info_zip_unicode_path_extra_field_mut( + extra_fields: &mut Vec, +) -> &mut InfoZipUnicodePathExtraField { + if !extra_fields.iter().any(|field| matches!(field, ExtraField::InfoZipUnicodePath(_))) { + extra_fields + .push(ExtraField::InfoZipUnicodePath(InfoZipUnicodePathExtraField::V1 { crc32: 0, unicode: vec![] })); + } + + for field in extra_fields.iter_mut() { + if let ExtraField::InfoZipUnicodePath(extra_field) = field { + return extra_field; + } + } + + panic!("InfoZipUnicodePathExtraField not found after insertion") +} + +pub(crate) fn get_or_put_info_zip_unicode_comment_extra_field_mut( + extra_fields: &mut Vec, +) -> &mut InfoZipUnicodeCommentExtraField { + if !extra_fields.iter().any(|field| matches!(field, ExtraField::InfoZipUnicodeComment(_))) { + extra_fields + .push(ExtraField::InfoZipUnicodeComment(InfoZipUnicodeCommentExtraField::V1 { crc32: 0, unicode: vec![] })); + } + + for field in extra_fields.iter_mut() { + if let ExtraField::InfoZipUnicodeComment(extra_field) = field { + return extra_field; + } + } + + panic!("InfoZipUnicodeCommentExtraField not found after insertion") +} diff --git a/crates/async_zip/src/date/builder.rs b/crates/async_zip/src/date/builder.rs new file mode 100644 index 0000000..ea660f9 --- /dev/null +++ b/crates/async_zip/src/date/builder.rs @@ -0,0 +1,83 @@ +// Copyright (c) 2024 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +use crate::ZipDateTime; + +/// A builder for [`ZipDateTime`]. +pub struct ZipDateTimeBuilder(pub(crate) ZipDateTime); + +impl From for ZipDateTimeBuilder { + fn from(date: ZipDateTime) -> Self { + Self(date) + } +} + +impl Default for ZipDateTimeBuilder { + fn default() -> Self { + Self::new() + } +} + +impl ZipDateTimeBuilder { + /// Constructs a new builder which defines the raw underlying data of a ZIP entry. + pub fn new() -> Self { + Self(ZipDateTime { date: 0, time: 0 }) + } + + /// Sets the date and time's year. + pub fn year(mut self, year: i32) -> Self { + let year: u16 = (((year - 1980) << 9) & 0xFE00).try_into().unwrap(); + self.0.date |= year; + self + } + + /// Sets the date and time's month. + pub fn month(mut self, month: u32) -> Self { + let month: u16 = ((month << 5) & 0x1E0).try_into().unwrap(); + self.0.date |= month; + self + } + + /// Sets the date and time's day. + pub fn day(mut self, day: u32) -> Self { + let day: u16 = (day & 0x1F).try_into().unwrap(); + self.0.date |= day; + self + } + + /// Sets the date and time's hour. + pub fn hour(mut self, hour: u32) -> Self { + let hour: u16 = ((hour << 11) & 0xF800).try_into().unwrap(); + self.0.time |= hour; + self + } + + /// Sets the date and time's minute. + pub fn minute(mut self, minute: u32) -> Self { + let minute: u16 = ((minute << 5) & 0x7E0).try_into().unwrap(); + self.0.time |= minute; + self + } + + /// Sets the date and time's second. + /// + /// Note that MS-DOS has a maximum granularity of two seconds. + pub fn second(mut self, second: u32) -> Self { + let second: u16 = ((second >> 1) & 0x1F).try_into().unwrap(); + self.0.time |= second; + self + } + + /// Consumes this builder and returns a final [`ZipDateTime`]. 
+ /// + /// This is equivalent to: + /// ``` + /// # use async_zip::{ZipDateTime, ZipDateTimeBuilder, Compression}; + /// # + /// # let builder = ZipDateTimeBuilder::new().year(2024).month(3).day(2); + /// let date: ZipDateTime = builder.into(); + /// ``` + pub fn build(self) -> ZipDateTime { + self.into() + } +} diff --git a/crates/async_zip/src/date/mod.rs b/crates/async_zip/src/date/mod.rs new file mode 100644 index 0000000..3b4fd4a --- /dev/null +++ b/crates/async_zip/src/date/mod.rs @@ -0,0 +1,112 @@ +// Copyright (c) 2021-2024 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +pub mod builder; + +#[cfg(feature = "chrono")] +use chrono::{DateTime, Datelike, LocalResult, TimeZone, Timelike, Utc}; + +use self::builder::ZipDateTimeBuilder; + +// https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#446 +// https://learn.microsoft.com/en-us/windows/win32/api/oleauto/nf-oleauto-dosdatetimetovarianttime + +/// A date and time stored as per the MS-DOS representation used by ZIP files. +#[derive(Debug, Default, PartialEq, Eq, Clone, Copy, Hash)] +pub struct ZipDateTime { + pub(crate) date: u16, + pub(crate) time: u16, +} + +impl ZipDateTime { + /// Returns the year of this date & time. + pub fn year(&self) -> i32 { + (((self.date & 0xFE00) >> 9) + 1980).into() + } + + /// Returns the month of this date & time. + pub fn month(&self) -> u32 { + ((self.date & 0x1E0) >> 5).into() + } + + /// Returns the day of this date & time. + pub fn day(&self) -> u32 { + (self.date & 0x1F).into() + } + + /// Returns the hour of this date & time. + pub fn hour(&self) -> u32 { + ((self.time & 0xF800) >> 11).into() + } + + /// Returns the minute of this date & time. + pub fn minute(&self) -> u32 { + ((self.time & 0x7E0) >> 5).into() + } + + /// Returns the second of this date & time. + /// + /// Note that MS-DOS has a maximum granularity of two seconds. + pub fn second(&self) -> u32 { + ((self.time & 0x1F) << 1).into() + } + + /// Constructs chrono's [`DateTime`] representation of this date & time. + /// + /// Note that this requires the `chrono` feature. + #[cfg(feature = "chrono")] + pub fn as_chrono(&self) -> LocalResult> { + self.into() + } + + /// Constructs this date & time from chrono's [`DateTime`] representation. + /// + /// Note that this requires the `chrono` feature. 
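Review note: a round-trip makes the MS-DOS bit packing above concrete, including the two-second floor on seconds. A doc-test-style sketch using only the public API:

```rust
use async_zip::ZipDateTimeBuilder;

fn main() {
    let dt = ZipDateTimeBuilder::new().year(2024).month(3).day(2).hour(13).minute(37).second(59).build();

    assert_eq!((dt.year(), dt.month(), dt.day()), (2024, 3, 2));
    assert_eq!((dt.hour(), dt.minute()), (13, 37));
    // MS-DOS stores seconds halved, so odd values round down on the round-trip.
    assert_eq!(dt.second(), 58);
}
```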
+ #[cfg(feature = "chrono")] + pub fn from_chrono(dt: &DateTime) -> Self { + dt.into() + } +} + +impl From for ZipDateTime { + fn from(builder: ZipDateTimeBuilder) -> Self { + builder.0 + } +} + +#[cfg(feature = "chrono")] +impl From<&DateTime> for ZipDateTime { + fn from(value: &DateTime) -> Self { + let mut builder = ZipDateTimeBuilder::new(); + + builder = builder.year(value.date_naive().year()); + builder = builder.month(value.date_naive().month()); + builder = builder.day(value.date_naive().day()); + builder = builder.hour(value.time().hour()); + builder = builder.minute(value.time().minute()); + builder = builder.second(value.time().second()); + + builder.build() + } +} + +#[cfg(feature = "chrono")] +impl From<&ZipDateTime> for LocalResult> { + fn from(value: &ZipDateTime) -> Self { + Utc.with_ymd_and_hms(value.year(), value.month(), value.day(), value.hour(), value.minute(), value.second()) + } +} + +#[cfg(feature = "chrono")] +impl From> for ZipDateTime { + fn from(value: DateTime) -> Self { + (&value).into() + } +} + +#[cfg(feature = "chrono")] +impl From for LocalResult> { + fn from(value: ZipDateTime) -> Self { + (&value).into() + } +} diff --git a/crates/async_zip/src/entry/builder.rs b/crates/async_zip/src/entry/builder.rs new file mode 100644 index 0000000..34993c6 --- /dev/null +++ b/crates/async_zip/src/entry/builder.rs @@ -0,0 +1,113 @@ +// Copyright (c) 2022 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +use crate::entry::ZipEntry; +use crate::spec::{attribute::AttributeCompatibility, header::ExtraField, Compression}; +use crate::{date::ZipDateTime, string::ZipString}; + +/// A builder for [`ZipEntry`]. +pub struct ZipEntryBuilder(pub(crate) ZipEntry); + +impl From for ZipEntryBuilder { + fn from(entry: ZipEntry) -> Self { + Self(entry) + } +} + +impl ZipEntryBuilder { + /// Constructs a new builder which defines the raw underlying data of a ZIP entry. + /// + /// A filename and compression method are needed to construct the builder as minimal parameters. + pub fn new(filename: ZipString, compression: Compression) -> Self { + Self(ZipEntry::new(filename, compression)) + } + + /// Sets the entry's filename. + pub fn filename(mut self, filename: ZipString) -> Self { + self.0.filename = filename; + self + } + + /// Sets the entry's compression method. + pub fn compression(mut self, compression: Compression) -> Self { + self.0.compression = compression; + self + } + + /// Set a size hint for the file, to be written into the local file header. + /// Unlikely to be useful except for the case of streaming files to be Store'd. + /// This size hint does not affect the central directory, nor does it affect whole files. + pub fn size, M: Into>(mut self, compressed_size: N, uncompressed_size: M) -> Self { + self.0.compressed_size = compressed_size.into(); + self.0.uncompressed_size = uncompressed_size.into(); + self + } + + /// Set the deflate compression option. + /// + /// If the compression type isn't deflate, this option has no effect. + #[cfg(any(feature = "deflate", feature = "bzip2", feature = "zstd", feature = "lzma", feature = "xz"))] + pub fn deflate_option(mut self, option: crate::DeflateOption) -> Self { + self.0.compression_level = option.into_level(); + self + } + + /// Sets the entry's attribute host compatibility. 
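Review note on the `chrono` conversions above: they return `LocalResult` because the raw MS-DOS words can encode an impossible calendar date, so callers must handle the non-`Single` cases. A sketch assuming the `chrono` feature is enabled:

```rust
use async_zip::ZipDateTime;
use chrono::{LocalResult, TimeZone, Utc};

fn main() {
    let chrono_dt = Utc.with_ymd_and_hms(2024, 3, 2, 13, 37, 58).unwrap();
    let zip_dt = ZipDateTime::from_chrono(&chrono_dt);

    // Converting back can, in principle, yield no valid date, hence the match.
    match zip_dt.as_chrono() {
        LocalResult::Single(dt) => assert_eq!(dt, chrono_dt),
        _ => panic!("round-trip produced an invalid or ambiguous date"),
    }
}
```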
+ pub fn attribute_compatibility(mut self, compatibility: AttributeCompatibility) -> Self { + self.0.attribute_compatibility = compatibility; + self + } + + /// Sets the entry's last modification date. + pub fn last_modification_date(mut self, date: ZipDateTime) -> Self { + self.0.last_modification_date = date; + self + } + + /// Sets the entry's internal file attribute. + pub fn internal_file_attribute(mut self, attribute: u16) -> Self { + self.0.internal_file_attribute = attribute; + self + } + + /// Sets the entry's external file attribute. + pub fn external_file_attribute(mut self, attribute: u32) -> Self { + self.0.external_file_attribute = attribute; + self + } + + /// Sets the entry's extra field data. + pub fn extra_fields(mut self, field: Vec) -> Self { + self.0.extra_fields = field; + self + } + + /// Sets the entry's file comment. + pub fn comment(mut self, comment: ZipString) -> Self { + self.0.comment = comment; + self + } + + /// Sets the entry's Unix permissions mode. + /// + /// If the attribute host compatibility isn't set to Unix, this will have no effect. + pub fn unix_permissions(mut self, mode: u16) -> Self { + if matches!(self.0.attribute_compatibility, AttributeCompatibility::Unix) { + self.0.external_file_attribute = (self.0.external_file_attribute & 0xFFFF) | (mode as u32) << 16; + } + self + } + + /// Consumes this builder and returns a final [`ZipEntry`]. + /// + /// This is equivalent to: + /// ``` + /// # use async_zip::{ZipEntry, ZipEntryBuilder, Compression}; + /// # + /// # let builder = ZipEntryBuilder::new(String::from("foo.bar").into(), Compression::Stored); + /// let entry: ZipEntry = builder.into(); + /// ``` + pub fn build(self) -> ZipEntry { + self.into() + } +} diff --git a/crates/async_zip/src/entry/mod.rs b/crates/async_zip/src/entry/mod.rs new file mode 100644 index 0000000..a0bd841 --- /dev/null +++ b/crates/async_zip/src/entry/mod.rs @@ -0,0 +1,219 @@ +// Copyright (c) 2022 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +pub mod builder; + +use std::ops::Deref; + +use futures_lite::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt, SeekFrom}; + +use crate::entry::builder::ZipEntryBuilder; +use crate::error::{Result, ZipError}; +use crate::spec::{ + attribute::AttributeCompatibility, + consts::LFH_SIGNATURE, + header::{ExtraField, LocalFileHeader}, + Compression, +}; +use crate::{string::ZipString, ZipDateTime}; + +/// An immutable store of data about a ZIP entry. +/// +/// This type cannot be directly constructed so instead, the [`ZipEntryBuilder`] must be used. Internally this builder +/// stores a [`ZipEntry`] so conversions between these two types via the [`From`] implementations will be +/// non-allocating. 
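Review note: a compact builder-chain example might be worth adding to the docs above, since `unix_permissions` silently does nothing unless the attribute compatibility is Unix (which is the default for new entries). A sketch against the public API:

```rust
use async_zip::{AttributeCompatibility, Compression, ZipEntryBuilder};

fn main() {
    let entry = ZipEntryBuilder::new("docs/readme.md".to_string().into(), Compression::Stored)
        .attribute_compatibility(AttributeCompatibility::Unix) // explicit, though Unix is the default
        .unix_permissions(0o644)
        .comment("example entry".to_string().into())
        .build();

    assert_eq!(entry.unix_permissions(), Some(0o644));
    assert_eq!(entry.filename().as_str().unwrap(), "docs/readme.md");
}
```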
+#[derive(Clone, Debug)] +pub struct ZipEntry { + pub(crate) filename: ZipString, + pub(crate) compression: Compression, + #[cfg(any( + feature = "deflate", + feature = "bzip2", + feature = "zstd", + feature = "lzma", + feature = "xz", + feature = "deflate64" + ))] + pub(crate) compression_level: async_compression::Level, + pub(crate) crc32: u32, + pub(crate) uncompressed_size: u64, + pub(crate) compressed_size: u64, + pub(crate) attribute_compatibility: AttributeCompatibility, + pub(crate) last_modification_date: ZipDateTime, + pub(crate) internal_file_attribute: u16, + pub(crate) external_file_attribute: u32, + pub(crate) extra_fields: Vec, + pub(crate) comment: ZipString, + pub(crate) data_descriptor: bool, +} + +impl From for ZipEntry { + fn from(builder: ZipEntryBuilder) -> Self { + builder.0 + } +} + +impl ZipEntry { + pub(crate) fn new(filename: ZipString, compression: Compression) -> Self { + ZipEntry { + filename, + compression, + #[cfg(any( + feature = "deflate", + feature = "bzip2", + feature = "zstd", + feature = "lzma", + feature = "xz", + feature = "deflate64" + ))] + compression_level: async_compression::Level::Default, + crc32: 0, + uncompressed_size: 0, + compressed_size: 0, + attribute_compatibility: AttributeCompatibility::Unix, + last_modification_date: ZipDateTime::default(), + internal_file_attribute: 0, + external_file_attribute: 0, + extra_fields: Vec::new(), + comment: String::new().into(), + data_descriptor: false, + } + } + + /// Returns the entry's filename. + /// + /// ## Note + /// This will return the raw filename stored during ZIP creation. If calling this method on entries retrieved from + /// untrusted ZIP files, the filename should be sanitised before being used as a path to prevent [directory + /// traversal attacks](https://en.wikipedia.org/wiki/Directory_traversal_attack). + pub fn filename(&self) -> &ZipString { + &self.filename + } + + /// Returns the entry's compression method. + pub fn compression(&self) -> Compression { + self.compression + } + + /// Returns the entry's CRC32 value. + pub fn crc32(&self) -> u32 { + self.crc32 + } + + /// Returns the entry's uncompressed size. + pub fn uncompressed_size(&self) -> u64 { + self.uncompressed_size + } + + /// Returns the entry's compressed size. + pub fn compressed_size(&self) -> u64 { + self.compressed_size + } + + /// Returns the entry's attribute's host compatibility. + pub fn attribute_compatibility(&self) -> AttributeCompatibility { + self.attribute_compatibility + } + + /// Returns the entry's last modification time & date. + pub fn last_modification_date(&self) -> &ZipDateTime { + &self.last_modification_date + } + + /// Returns the entry's internal file attribute. + pub fn internal_file_attribute(&self) -> u16 { + self.internal_file_attribute + } + + /// Returns the entry's external file attribute + pub fn external_file_attribute(&self) -> u32 { + self.external_file_attribute + } + + /// Returns the entry's extra field data. + pub fn extra_fields(&self) -> &[ExtraField] { + &self.extra_fields + } + + /// Returns the entry's file comment. + pub fn comment(&self) -> &ZipString { + &self.comment + } + + /// Returns the entry's integer-based UNIX permissions. + /// + /// # Note + /// This will return None if the attribute host compatibility is not listed as Unix. 
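Review note: the `filename()` doc above warns about directory traversal but leaves sanitisation to the consumer. A conservative, hypothetical guard (not part of this crate) could look like this:

```rust
use std::path::{Component, Path};

// Hypothetical consumer-side check: reject entry names that could escape
// the extraction root (absolute paths, `..` components, etc.).
fn is_safe_entry_name(name: &str) -> bool {
    let path = Path::new(name);
    !path.is_absolute()
        && path.components().all(|c| matches!(c, Component::Normal(_) | Component::CurDir))
}

fn main() {
    assert!(is_safe_entry_name("docs/readme.md"));
    assert!(!is_safe_entry_name("../../etc/passwd"));
    assert!(!is_safe_entry_name("/etc/passwd"));
}
```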
+    pub fn unix_permissions(&self) -> Option<u16> {
+        if !matches!(self.attribute_compatibility, AttributeCompatibility::Unix) {
+            return None;
+        }
+
+        Some(((self.external_file_attribute) >> 16) as u16)
+    }
+
+    /// Returns whether or not the entry represents a directory.
+    pub fn dir(&self) -> Result<bool> {
+        Ok(self.filename.as_str()?.ends_with('/'))
+    }
+}
+
+/// An immutable store of data about how a ZIP entry is stored within a specific archive.
+///
+/// Besides storing archive-independent information like the size and timestamp, it can also be used to query
+/// information about how the entry is stored in an archive.
+#[derive(Clone)]
+pub struct StoredZipEntry {
+    pub(crate) entry: ZipEntry,
+    // pub(crate) general_purpose_flag: GeneralPurposeFlag,
+    pub(crate) file_offset: u64,
+    pub(crate) header_size: u64,
+}
+
+impl StoredZipEntry {
+    /// Returns the offset in bytes to where the header of the entry starts.
+    pub fn header_offset(&self) -> u64 {
+        self.file_offset
+    }
+
+    /// Returns the combined size in bytes of the header, the filename, and any extra fields.
+    ///
+    /// Note: This uses the extra field length stored in the central directory, which may differ from that stored in
+    /// the local file header. See the specification for details.
+    pub fn header_size(&self) -> u64 {
+        self.header_size
+    }
+
+    /// Seek to the offset in bytes where the data of the entry starts.
+    pub(crate) async fn seek_to_data_offset<R: AsyncRead + AsyncSeek + Unpin>(&self, mut reader: &mut R) -> Result<()> {
+        // Seek to the header
+        reader.seek(SeekFrom::Start(self.file_offset)).await?;
+
+        // Check the signature
+        let signature = {
+            let mut buffer = [0; 4];
+            reader.read_exact(&mut buffer).await?;
+            u32::from_le_bytes(buffer)
+        };
+
+        match signature {
+            LFH_SIGNATURE => (),
+            actual => return Err(ZipError::UnexpectedHeaderError(actual, LFH_SIGNATURE)),
+        };
+
+        // Skip the local file header and trailing data
+        let header = LocalFileHeader::from_reader(&mut reader).await?;
+        let trailing_size = (header.file_name_length as i64) + (header.extra_field_length as i64);
+        reader.seek(SeekFrom::Current(trailing_size)).await?;
+
+        Ok(())
+    }
+}
+
+impl Deref for StoredZipEntry {
+    type Target = ZipEntry;
+
+    fn deref(&self) -> &Self::Target {
+        &self.entry
+    }
+}
diff --git a/crates/async_zip/src/error.rs b/crates/async_zip/src/error.rs
new file mode 100644
index 0000000..f383112
--- /dev/null
+++ b/crates/async_zip/src/error.rs
@@ -0,0 +1,72 @@
+// Copyright (c) 2021 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+//! A module which holds relevant error reporting structures/types.
+
+use std::fmt::{Display, Formatter};
+use thiserror::Error;
+
+/// A Result type alias over ZipError to minimise repetition.
+pub type Result<V> = std::result::Result<V, ZipError>;
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum Zip64ErrorCase {
+    TooManyFiles,
+    LargeFile,
+}
+
+impl Display for Zip64ErrorCase {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::TooManyFiles => write!(f, "More than 65535 files in archive"),
+            Self::LargeFile => write!(f, "File is larger than 4 GiB"),
+        }
+    }
+}
+
+/// An enum of possible errors and their descriptions.
+#[non_exhaustive] +#[derive(Debug, Error)] +pub enum ZipError { + #[error("feature not supported: '{0}'")] + FeatureNotSupported(&'static str), + #[error("compression not supported: {0}")] + CompressionNotSupported(u16), + #[error("host attribute compatibility not supported: {0}")] + AttributeCompatibilityNotSupported(u16), + #[error("attempted to read a ZIP64 file whilst on a 32-bit target")] + TargetZip64NotSupported, + #[error("attempted to write a ZIP file with force_no_zip64 when ZIP64 is needed: {0}")] + Zip64Needed(Zip64ErrorCase), + #[error("end of file has not been reached")] + EOFNotReached, + #[error("extra fields exceeded maximum size")] + ExtraFieldTooLarge, + #[error("comment exceeded maximum size")] + CommentTooLarge, + #[error("filename exceeded maximum size")] + FileNameTooLarge, + #[error("attempted to convert non-UTF8 bytes to a string/str")] + StringNotUtf8, + + #[error("unable to locate the end of central directory record")] + UnableToLocateEOCDR, + #[error("extra field size was indicated to be {0} but only {1} bytes remain")] + InvalidExtraFieldHeader(u16, usize), + #[error("zip64 extended information field was incomplete")] + Zip64ExtendedFieldIncomplete, + + #[error("an upstream reader returned an error: {0}")] + UpstreamReadError(#[from] std::io::Error), + #[error("a computed CRC32 value did not match the expected value")] + CRC32CheckError, + #[error("entry index was out of bounds")] + EntryIndexOutOfBounds, + #[error("Encountered an unexpected header (actual: {0:#x}, expected: {1:#x}).")] + UnexpectedHeaderError(u32, u32), + + #[error("Info-ZIP Unicode Comment Extra Field was incomplete")] + InfoZipUnicodeCommentFieldIncomplete, + #[error("Info-ZIP Unicode Path Extra Field was incomplete")] + InfoZipUnicodePathFieldIncomplete, +} diff --git a/crates/async_zip/src/file/builder.rs b/crates/async_zip/src/file/builder.rs new file mode 100644 index 0000000..209ad16 --- /dev/null +++ b/crates/async_zip/src/file/builder.rs @@ -0,0 +1,44 @@ +// Copyright (c) 2022 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +use crate::{file::ZipFile, string::ZipString}; + +/// A builder for [`ZipFile`]. +pub struct ZipFileBuilder(pub(crate) ZipFile); + +impl From for ZipFileBuilder { + fn from(file: ZipFile) -> Self { + Self(file) + } +} + +impl Default for ZipFileBuilder { + fn default() -> Self { + ZipFileBuilder(ZipFile { entries: Vec::new(), zip64: false, comment: String::new().into() }) + } +} + +impl ZipFileBuilder { + pub fn new() -> Self { + Self::default() + } + + /// Sets the file's comment. + pub fn comment(mut self, comment: ZipString) -> Self { + self.0.comment = comment; + self + } + + /// Consumes this builder and returns a final [`ZipFile`]. + /// + /// This is equivalent to: + /// ``` + /// # use async_zip::{ZipFile, ZipFileBuilder}; + /// # + /// # let builder = ZipFileBuilder::new(); + /// let file: ZipFile = builder.into(); + /// ``` + pub fn build(self) -> ZipFile { + self.into() + } +} diff --git a/crates/async_zip/src/file/mod.rs b/crates/async_zip/src/file/mod.rs new file mode 100644 index 0000000..f503a8d --- /dev/null +++ b/crates/async_zip/src/file/mod.rs @@ -0,0 +1,38 @@ +// Copyright (c) 2022 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +pub(crate) mod builder; + +use crate::{entry::StoredZipEntry, string::ZipString}; +use builder::ZipFileBuilder; + +/// An immutable store of data about a ZIP file. 
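Review note: since `ZipError` above is `#[non_exhaustive]`, downstream matches always need a catch-all arm. A sketch of consumer-side handling for the zip64 cases, falling back to the thiserror-derived `Display` output:

```rust
use async_zip::error::{Zip64ErrorCase, ZipError};

// Sketch: map the zip64 failure modes to friendlier messages.
fn describe(err: &ZipError) -> String {
    match err {
        ZipError::Zip64Needed(Zip64ErrorCase::TooManyFiles) => {
            "archive needs zip64: more than 65535 entries".into()
        }
        ZipError::Zip64Needed(Zip64ErrorCase::LargeFile) => {
            "archive needs zip64: a size or offset exceeds 4 GiB".into()
        }
        other => other.to_string(),
    }
}

fn main() {
    let err = ZipError::Zip64Needed(Zip64ErrorCase::LargeFile);
    assert!(describe(&err).contains("zip64"));
}
```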
+#[derive(Clone)] +pub struct ZipFile { + pub(crate) entries: Vec, + pub(crate) zip64: bool, + pub(crate) comment: ZipString, +} + +impl From for ZipFile { + fn from(builder: ZipFileBuilder) -> Self { + builder.0 + } +} + +impl ZipFile { + /// Returns a list of this ZIP file's entries. + pub fn entries(&self) -> &[StoredZipEntry] { + &self.entries + } + + /// Returns this ZIP file's trailing comment. + pub fn comment(&self) -> &ZipString { + &self.comment + } + + /// Returns whether or not this ZIP file is zip64 + pub fn zip64(&self) -> bool { + self.zip64 + } +} diff --git a/crates/async_zip/src/lib.rs b/crates/async_zip/src/lib.rs new file mode 100644 index 0000000..67e8fd5 --- /dev/null +++ b/crates/async_zip/src/lib.rs @@ -0,0 +1,62 @@ +// Copyright (c) 2021-2023 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +// Document all features on docs.rs +#![cfg_attr(docsrs, feature(doc_cfg))] + +//! An asynchronous ZIP archive reading/writing crate. +//! +//! ## Features +//! - A base implementation atop `futures`'s IO traits. +//! - An extended implementation atop `tokio`'s IO traits. +//! - Support for Stored, Deflate, bzip2, LZMA, zstd, and xz compression methods. +//! - Various different reading approaches (seek, stream, filesystem, in-memory buffer). +//! - Support for writing complete data (u8 slices) or stream writing using data descriptors. +//! - Initial support for ZIP64 reading and writing. +//! - Aims for reasonable [specification](https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md) compliance. +//! +//! ## Installation +//! +//! ```toml +//! [dependencies] +//! async_zip = { version = "0.0.17", features = ["full"] } +//! ``` +//! +//! ### Feature Flags +//! - `full` - Enables all below features. +//! - `full-wasm` - Enables all below features that are compatible with WASM. +//! - `chrono` - Enables support for parsing dates via `chrono`. +//! - `tokio` - Enables support for the `tokio` implementation module. +//! - `tokio-fs` - Enables support for the `tokio::fs` reading module. +//! - `deflate` - Enables support for the Deflate compression method. +//! - `bzip2` - Enables support for the bzip2 compression method. +//! - `lzma` - Enables support for the LZMA compression method. +//! - `zstd` - Enables support for the zstd compression method. +//! - `xz` - Enables support for the xz compression method. +//! +//! 
[Read more.](https://github.com/Majored/rs-async-zip) + +pub mod base; +pub mod error; + +#[cfg(feature = "tokio")] +pub mod tokio; + +pub(crate) mod date; +pub(crate) mod entry; +pub(crate) mod file; +pub(crate) mod spec; +pub(crate) mod string; +pub(crate) mod utils; + +#[cfg(test)] +pub(crate) mod tests; + +pub use crate::spec::attribute::AttributeCompatibility; +pub use crate::spec::compression::{Compression, DeflateOption}; + +pub use crate::date::{builder::ZipDateTimeBuilder, ZipDateTime}; +pub use crate::entry::{builder::ZipEntryBuilder, StoredZipEntry, ZipEntry}; +pub use crate::file::{builder::ZipFileBuilder, ZipFile}; + +pub use crate::string::{StringEncoding, ZipString}; diff --git a/crates/async_zip/src/spec/attribute.rs b/crates/async_zip/src/spec/attribute.rs new file mode 100644 index 0000000..0764a88 --- /dev/null +++ b/crates/async_zip/src/spec/attribute.rs @@ -0,0 +1,41 @@ +// Copyright (c) 2022 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +use crate::error::{Result, ZipError}; + +/// An attribute host compatibility supported by this crate. +#[non_exhaustive] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum AttributeCompatibility { + Unix, +} + +impl TryFrom for AttributeCompatibility { + type Error = ZipError; + + // Convert a u16 stored with little endianness into a supported attribute host compatibility. + // https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#4422 + fn try_from(value: u16) -> Result { + match value { + 3 => Ok(AttributeCompatibility::Unix), + _ => Err(ZipError::AttributeCompatibilityNotSupported(value)), + } + } +} + +impl From<&AttributeCompatibility> for u16 { + // Convert a supported attribute host compatibility into its relevant u16 stored with little endianness. + // https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#4422 + fn from(compatibility: &AttributeCompatibility) -> Self { + match compatibility { + AttributeCompatibility::Unix => 3, + } + } +} + +impl From for u16 { + // Convert a supported attribute host compatibility into its relevant u16 stored with little endianness. + fn from(compatibility: AttributeCompatibility) -> Self { + (&compatibility).into() + } +} diff --git a/crates/async_zip/src/spec/compression.rs b/crates/async_zip/src/spec/compression.rs new file mode 100644 index 0000000..01d56a7 --- /dev/null +++ b/crates/async_zip/src/spec/compression.rs @@ -0,0 +1,111 @@ +// Copyright (c) 2021 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +use crate::error::{Result, ZipError}; + +#[cfg(any(feature = "deflate", feature = "bzip2", feature = "zstd", feature = "lzma", feature = "xz"))] +use async_compression::Level; + +/// A compression method supported by this crate. +#[non_exhaustive] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Compression { + Stored, + #[cfg(feature = "deflate")] + Deflate, + #[cfg(feature = "deflate64")] + Deflate64, + #[cfg(feature = "bzip2")] + Bz, + #[cfg(feature = "lzma")] + Lzma, + #[cfg(feature = "zstd")] + Zstd, + #[cfg(feature = "xz")] + Xz, +} + +impl TryFrom for Compression { + type Error = ZipError; + + // Convert a u16 stored with little endianness into a supported compression method. 
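Review note on the `AttributeCompatibility` conversions above: the mapping is a plain u16 table keyed on the host byte of the version-made-by field. A quick doc-test-style check:

```rust
use async_zip::AttributeCompatibility;

fn main() {
    // 3 is the Unix host ID from APPNOTE 4.4.2; it round-trips cleanly.
    assert!(matches!(AttributeCompatibility::try_from(3u16), Ok(AttributeCompatibility::Unix)));
    assert_eq!(u16::from(AttributeCompatibility::Unix), 3);

    // Every other host ID (e.g. 0 = MS-DOS) is currently rejected.
    assert!(AttributeCompatibility::try_from(0u16).is_err());
}
```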
+ // https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#445 + fn try_from(value: u16) -> Result { + match value { + 0 => Ok(Compression::Stored), + #[cfg(feature = "deflate")] + 8 => Ok(Compression::Deflate), + #[cfg(feature = "deflate64")] + 9 => Ok(Compression::Deflate64), + #[cfg(feature = "bzip2")] + 12 => Ok(Compression::Bz), + #[cfg(feature = "lzma")] + 14 => Ok(Compression::Lzma), + #[cfg(feature = "zstd")] + 93 => Ok(Compression::Zstd), + #[cfg(feature = "xz")] + 95 => Ok(Compression::Xz), + _ => Err(ZipError::CompressionNotSupported(value)), + } + } +} + +impl From<&Compression> for u16 { + // Convert a supported compression method into its relevant u16 stored with little endianness. + // https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#445 + fn from(compression: &Compression) -> u16 { + match compression { + Compression::Stored => 0, + #[cfg(feature = "deflate")] + Compression::Deflate => 8, + #[cfg(feature = "deflate64")] + Compression::Deflate64 => 9, + #[cfg(feature = "bzip2")] + Compression::Bz => 12, + #[cfg(feature = "lzma")] + Compression::Lzma => 14, + #[cfg(feature = "zstd")] + Compression::Zstd => 93, + #[cfg(feature = "xz")] + Compression::Xz => 95, + } + } +} + +impl From for u16 { + fn from(compression: Compression) -> u16 { + (&compression).into() + } +} + +/// Level of compression data should be compressed with for deflate. +#[derive(Debug, Clone, Copy)] +pub enum DeflateOption { + // Normal (-en) compression option was used. + Normal, + + // Maximum (-exx/-ex) compression option was used. + Maximum, + + // Fast (-ef) compression option was used. + Fast, + + // Super Fast (-es) compression option was used. + Super, + + /// Other implementation defined level. + Other(i32), +} + +#[cfg(any(feature = "deflate", feature = "bzip2", feature = "zstd", feature = "lzma", feature = "xz"))] +impl DeflateOption { + pub(crate) fn into_level(self) -> Level { + // FIXME: There's no clear documentation on what these specific levels defined in the ZIP specification relate + // to. We want to be compatible with any other library, and not specific to `async_compression`'s levels. + if let Self::Other(l) = self { + Level::Precise(l) + } else { + Level::Default + } + } +} diff --git a/crates/async_zip/src/spec/consts.rs b/crates/async_zip/src/spec/consts.rs new file mode 100644 index 0000000..5500a24 --- /dev/null +++ b/crates/async_zip/src/spec/consts.rs @@ -0,0 +1,44 @@ +// Copyright (c) 2022 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +pub const SIGNATURE_LENGTH: usize = 4; + +// Local file header constants +// +// https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#437 +pub const LFH_SIGNATURE: u32 = 0x4034b50; +#[allow(dead_code)] +pub const LFH_LENGTH: usize = 26; + +// Central directory header constants +// +// https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#4312 +pub const CDH_SIGNATURE: u32 = 0x2014b50; +#[allow(dead_code)] +pub const CDH_LENGTH: usize = 42; + +// End of central directory record constants +// +// https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#4316 +pub const EOCDR_SIGNATURE: u32 = 0x6054b50; +/// The minimum length of the EOCDR, excluding the signature. +pub const EOCDR_LENGTH: usize = 18; + +/// The signature for the zip64 end of central directory record. 
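Review note: it may be worth saying somewhere that the signature constants above are the familiar "PK" magics once serialised, since every header write in this crate relies on that. A quick check:

```rust
fn main() {
    // Little-endian serialisation of the u32 constants yields the classic
    // "PK" magic bytes defined in APPNOTE.
    assert_eq!(0x4034b50u32.to_le_bytes(), *b"PK\x03\x04"); // local file header
    assert_eq!(0x2014b50u32.to_le_bytes(), *b"PK\x01\x02"); // central directory header
    assert_eq!(0x6054b50u32.to_le_bytes(), *b"PK\x05\x06"); // end of central directory
}
```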
+/// Ref: https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#4314 +pub const ZIP64_EOCDR_SIGNATURE: u32 = 0x06064b50; +/// The signature for the zip64 end of central directory locator. +/// Ref: https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#4315 +pub const ZIP64_EOCDL_SIGNATURE: u32 = 0x07064b50; +/// The length of the ZIP64 EOCDL, including the signature. +/// The EOCDL has a fixed size, thankfully. +pub const ZIP64_EOCDL_LENGTH: u64 = 20; + +/// The contents of a header field when one must reference the zip64 version instead. +pub const NON_ZIP64_MAX_SIZE: u32 = 0xFFFFFFFF; +/// The maximum number of files or disks in a ZIP file before it requires ZIP64. +pub const NON_ZIP64_MAX_NUM_FILES: u16 = 0xFFFF; + +// https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#439 +pub const DATA_DESCRIPTOR_SIGNATURE: u32 = 0x8074b50; +pub const DATA_DESCRIPTOR_LENGTH: usize = 12; diff --git a/crates/async_zip/src/spec/extra_field.rs b/crates/async_zip/src/spec/extra_field.rs new file mode 100644 index 0000000..7506e95 --- /dev/null +++ b/crates/async_zip/src/spec/extra_field.rs @@ -0,0 +1,320 @@ +// Copyright Cognite AS, 2023 + +use crate::error::{Result as ZipResult, ZipError}; +use crate::spec::header::{ + ExtraField, HeaderId, InfoZipUnicodeCommentExtraField, InfoZipUnicodePathExtraField, UnknownExtraField, + Zip64ExtendedInformationExtraField, +}; + +use super::consts::NON_ZIP64_MAX_SIZE; + +pub(crate) trait ExtraFieldAsBytes { + fn as_bytes(&self) -> Vec; + + fn count_bytes(&self) -> usize; +} + +impl ExtraFieldAsBytes for &[ExtraField] { + fn as_bytes(&self) -> Vec { + let mut buffer = Vec::new(); + for field in self.iter() { + buffer.append(&mut field.as_bytes()); + } + buffer + } + + fn count_bytes(&self) -> usize { + self.iter().map(|field| field.count_bytes()).sum() + } +} + +impl ExtraFieldAsBytes for ExtraField { + fn as_bytes(&self) -> Vec { + match self { + ExtraField::Zip64ExtendedInformation(field) => field.as_bytes(), + ExtraField::InfoZipUnicodeComment(field) => field.as_bytes(), + ExtraField::InfoZipUnicodePath(field) => field.as_bytes(), + ExtraField::Unknown(field) => field.as_bytes(), + } + } + + fn count_bytes(&self) -> usize { + match self { + ExtraField::Zip64ExtendedInformation(field) => field.count_bytes(), + ExtraField::InfoZipUnicodeComment(field) => field.count_bytes(), + ExtraField::InfoZipUnicodePath(field) => field.count_bytes(), + ExtraField::Unknown(field) => field.count_bytes(), + } + } +} + +impl ExtraFieldAsBytes for UnknownExtraField { + fn as_bytes(&self) -> Vec { + let mut bytes = Vec::new(); + let header_id: u16 = self.header_id.into(); + bytes.append(&mut header_id.to_le_bytes().to_vec()); + bytes.append(&mut self.data_size.to_le_bytes().to_vec()); + bytes.append(&mut self.content.clone()); + + bytes + } + + fn count_bytes(&self) -> usize { + 4 + self.content.len() + } +} + +impl ExtraFieldAsBytes for Zip64ExtendedInformationExtraField { + fn as_bytes(&self) -> Vec { + let mut bytes = Vec::new(); + let header_id: u16 = self.header_id.into(); + bytes.append(&mut header_id.to_le_bytes().to_vec()); + bytes.append(&mut (self.content_size() as u16).to_le_bytes().to_vec()); + if let Some(uncompressed_size) = &self.uncompressed_size { + bytes.append(&mut uncompressed_size.to_le_bytes().to_vec()); + } + if let Some(compressed_size) = &self.compressed_size { + bytes.append(&mut compressed_size.to_le_bytes().to_vec()); + } + if let Some(relative_header_offset) = &self.relative_header_offset { + bytes.append(&mut 
relative_header_offset.to_le_bytes().to_vec()); + } + if let Some(disk_start_number) = &self.disk_start_number { + bytes.append(&mut disk_start_number.to_le_bytes().to_vec()); + } + + bytes + } + + fn count_bytes(&self) -> usize { + 4 + self.content_size() + } +} + +impl ExtraFieldAsBytes for InfoZipUnicodeCommentExtraField { + fn as_bytes(&self) -> Vec { + let mut bytes = Vec::new(); + let header_id: u16 = HeaderId::INFO_ZIP_UNICODE_COMMENT_EXTRA_FIELD.into(); + bytes.append(&mut header_id.to_le_bytes().to_vec()); + match self { + InfoZipUnicodeCommentExtraField::V1 { crc32, unicode } => { + let data_size: u16 = (5 + unicode.len()).try_into().unwrap(); + bytes.append(&mut data_size.to_le_bytes().to_vec()); + bytes.push(1); + bytes.append(&mut crc32.to_le_bytes().to_vec()); + bytes.append(&mut unicode.clone()); + } + InfoZipUnicodeCommentExtraField::Unknown { version, data } => { + let data_size: u16 = (1 + data.len()).try_into().unwrap(); + bytes.append(&mut data_size.to_le_bytes().to_vec()); + bytes.push(*version); + bytes.append(&mut data.clone()); + } + } + bytes + } + + fn count_bytes(&self) -> usize { + match self { + InfoZipUnicodeCommentExtraField::V1 { unicode, .. } => 9 + unicode.len(), + InfoZipUnicodeCommentExtraField::Unknown { data, .. } => 5 + data.len(), + } + } +} + +impl ExtraFieldAsBytes for InfoZipUnicodePathExtraField { + fn as_bytes(&self) -> Vec { + let mut bytes = Vec::new(); + let header_id: u16 = HeaderId::INFO_ZIP_UNICODE_PATH_EXTRA_FIELD.into(); + bytes.append(&mut header_id.to_le_bytes().to_vec()); + match self { + InfoZipUnicodePathExtraField::V1 { crc32, unicode } => { + let data_size: u16 = (5 + unicode.len()).try_into().unwrap(); + bytes.append(&mut data_size.to_le_bytes().to_vec()); + bytes.push(1); + bytes.append(&mut crc32.to_le_bytes().to_vec()); + bytes.append(&mut unicode.clone()); + } + InfoZipUnicodePathExtraField::Unknown { version, data } => { + let data_size: u16 = (1 + data.len()).try_into().unwrap(); + bytes.append(&mut data_size.to_le_bytes().to_vec()); + bytes.push(*version); + bytes.append(&mut data.clone()); + } + } + bytes + } + + fn count_bytes(&self) -> usize { + match self { + InfoZipUnicodePathExtraField::V1 { unicode, .. } => 9 + unicode.len(), + InfoZipUnicodePathExtraField::Unknown { data, .. } => 5 + data.len(), + } + } +} + +/// Parse a zip64 extra field from bytes. +/// The content of "data" should exclude the header. 
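Review note: the conditional layout handled by the parser that follows (each 64-bit size is present only when the corresponding 32-bit header field carries the 0xFFFFFFFF sentinel) deserves a test. A sketch that would live in this module, since the parser is private:

```rust
#[cfg(test)]
mod zip64_parse_tests {
    use super::*;

    #[test]
    fn sizes_only_parsed_behind_sentinel() {
        // 16 bytes of field content: a u64 uncompressed size, then a u64 compressed size.
        let mut data = Vec::new();
        data.extend_from_slice(&10_000_000_000u64.to_le_bytes());
        data.extend_from_slice(&9_000_000_000u64.to_le_bytes());

        // Both 32-bit header fields carry the sentinel, so both u64s are consumed.
        let field = zip64_extended_information_field_from_bytes(
            HeaderId::ZIP64_EXTENDED_INFORMATION_EXTRA_FIELD,
            &data,
            NON_ZIP64_MAX_SIZE,
            NON_ZIP64_MAX_SIZE,
        )
        .unwrap();

        assert_eq!(field.uncompressed_size, Some(10_000_000_000));
        assert_eq!(field.compressed_size, Some(9_000_000_000));
    }
}
```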
+fn zip64_extended_information_field_from_bytes( + header_id: HeaderId, + data: &[u8], + uncompressed_size: u32, + compressed_size: u32, +) -> ZipResult { + // slice.take is nightly-only so we'll just use an index to track the current position + let mut current_idx = 0; + let uncompressed_size = if uncompressed_size == NON_ZIP64_MAX_SIZE && data.len() >= current_idx + 8 { + let val = Some(u64::from_le_bytes(data[current_idx..current_idx + 8].try_into().unwrap())); + current_idx += 8; + val + } else { + None + }; + + let compressed_size = if compressed_size == NON_ZIP64_MAX_SIZE && data.len() >= current_idx + 8 { + let val = Some(u64::from_le_bytes(data[current_idx..current_idx + 8].try_into().unwrap())); + current_idx += 8; + val + } else { + None + }; + + let relative_header_offset = if data.len() >= current_idx + 8 { + let val = Some(u64::from_le_bytes(data[current_idx..current_idx + 8].try_into().unwrap())); + current_idx += 8; + val + } else { + None + }; + + #[allow(unused_assignments)] + let disk_start_number = if data.len() >= current_idx + 4 { + let val = Some(u32::from_le_bytes(data[current_idx..current_idx + 4].try_into().unwrap())); + current_idx += 4; + val + } else { + None + }; + + Ok(Zip64ExtendedInformationExtraField { + header_id, + uncompressed_size, + compressed_size, + relative_header_offset, + disk_start_number, + }) +} + +fn info_zip_unicode_comment_extra_field_from_bytes( + _header_id: HeaderId, + data_size: u16, + data: &[u8], +) -> ZipResult { + if data.is_empty() { + return Err(ZipError::InfoZipUnicodeCommentFieldIncomplete); + } + let version = data[0]; + match version { + 1 => { + if data.len() < 5 { + return Err(ZipError::InfoZipUnicodeCommentFieldIncomplete); + } + let crc32 = u32::from_le_bytes(data[1..5].try_into().unwrap()); + let unicode = data[5..(data_size as usize)].to_vec(); + Ok(InfoZipUnicodeCommentExtraField::V1 { crc32, unicode }) + } + _ => Ok(InfoZipUnicodeCommentExtraField::Unknown { version, data: data[1..(data_size as usize)].to_vec() }), + } +} + +fn info_zip_unicode_path_extra_field_from_bytes( + _header_id: HeaderId, + data_size: u16, + data: &[u8], +) -> ZipResult { + if data.is_empty() { + return Err(ZipError::InfoZipUnicodePathFieldIncomplete); + } + let version = data[0]; + match version { + 1 => { + if data.len() < 5 { + return Err(ZipError::InfoZipUnicodePathFieldIncomplete); + } + let crc32 = u32::from_le_bytes(data[1..5].try_into().unwrap()); + let unicode = data[5..(data_size as usize)].to_vec(); + Ok(InfoZipUnicodePathExtraField::V1 { crc32, unicode }) + } + _ => Ok(InfoZipUnicodePathExtraField::Unknown { version, data: data[1..(data_size as usize)].to_vec() }), + } +} + +pub(crate) fn extra_field_from_bytes( + header_id: HeaderId, + data_size: u16, + data: &[u8], + uncompressed_size: u32, + compressed_size: u32, +) -> ZipResult { + match header_id { + HeaderId::ZIP64_EXTENDED_INFORMATION_EXTRA_FIELD => Ok(ExtraField::Zip64ExtendedInformation( + zip64_extended_information_field_from_bytes(header_id, data, uncompressed_size, compressed_size)?, + )), + HeaderId::INFO_ZIP_UNICODE_COMMENT_EXTRA_FIELD => Ok(ExtraField::InfoZipUnicodeComment( + info_zip_unicode_comment_extra_field_from_bytes(header_id, data_size, data)?, + )), + HeaderId::INFO_ZIP_UNICODE_PATH_EXTRA_FIELD => Ok(ExtraField::InfoZipUnicodePath( + info_zip_unicode_path_extra_field_from_bytes(header_id, data_size, data)?, + )), + _ => Ok(ExtraField::Unknown(UnknownExtraField { header_id, data_size, content: data.to_vec() })), + } +} + +pub struct 
Zip64ExtendedInformationExtraFieldBuilder { + field: Zip64ExtendedInformationExtraField, +} + +impl Zip64ExtendedInformationExtraFieldBuilder { + pub fn new() -> Self { + Self { + field: Zip64ExtendedInformationExtraField { + header_id: HeaderId::ZIP64_EXTENDED_INFORMATION_EXTRA_FIELD, + uncompressed_size: None, + compressed_size: None, + relative_header_offset: None, + disk_start_number: None, + }, + } + } + + pub fn sizes(mut self, compressed_size: u64, uncompressed_size: u64) -> Self { + self.field.compressed_size = Some(compressed_size); + self.field.uncompressed_size = Some(uncompressed_size); + self + } + + pub fn relative_header_offset(mut self, relative_header_offset: u64) -> Self { + self.field.relative_header_offset = Some(relative_header_offset); + self + } + + #[allow(dead_code)] + pub fn disk_start_number(mut self, disk_start_number: u32) -> Self { + self.field.disk_start_number = Some(disk_start_number); + self + } + + pub fn eof_only(&self) -> bool { + (self.field.uncompressed_size.is_none() && self.field.compressed_size.is_none()) + && (self.field.relative_header_offset.is_some() || self.field.disk_start_number.is_some()) + } + + pub fn build(self) -> ZipResult { + let field = self.field; + + if field.content_size() == 0 { + return Err(ZipError::Zip64ExtendedFieldIncomplete); + } + Ok(field) + } +} diff --git a/crates/async_zip/src/spec/header.rs b/crates/async_zip/src/spec/header.rs new file mode 100644 index 0000000..f7c4392 --- /dev/null +++ b/crates/async_zip/src/spec/header.rs @@ -0,0 +1,161 @@ +// Copyright (c) 2021 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +// https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#437 +pub struct LocalFileHeader { + pub version: u16, + pub flags: GeneralPurposeFlag, + pub compression: u16, + pub mod_time: u16, + pub mod_date: u16, + pub crc: u32, + pub compressed_size: u32, + pub uncompressed_size: u32, + pub file_name_length: u16, + pub extra_field_length: u16, +} + +// https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#444 +#[derive(Copy, Clone)] +pub struct GeneralPurposeFlag { + pub encrypted: bool, + pub data_descriptor: bool, + pub filename_unicode: bool, +} + +/// 2 byte header ids +/// Ref https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#452 +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub struct HeaderId(pub u16); + +impl HeaderId { + pub const ZIP64_EXTENDED_INFORMATION_EXTRA_FIELD: HeaderId = HeaderId(0x0001); + pub const INFO_ZIP_UNICODE_COMMENT_EXTRA_FIELD: HeaderId = HeaderId(0x6375); + pub const INFO_ZIP_UNICODE_PATH_EXTRA_FIELD: HeaderId = HeaderId(0x7075); +} + +impl From for HeaderId { + fn from(value: u16) -> Self { + HeaderId(value) + } +} + +impl From for u16 { + fn from(value: HeaderId) -> Self { + value.0 + } +} + +/// Represents each extra field. +/// Not strictly part of the spec, but is the most useful way to represent the data. +#[derive(Clone, Debug)] +#[non_exhaustive] +pub enum ExtraField { + Zip64ExtendedInformation(Zip64ExtendedInformationExtraField), + InfoZipUnicodeComment(InfoZipUnicodeCommentExtraField), + InfoZipUnicodePath(InfoZipUnicodePathExtraField), + Unknown(UnknownExtraField), +} + +/// An extended information header for Zip64. +/// This field is used both for local file headers and central directory records. 
+/// https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#453
+#[derive(Clone, Debug)]
+pub struct Zip64ExtendedInformationExtraField {
+    pub header_id: HeaderId,
+    pub uncompressed_size: Option<u64>,
+    pub compressed_size: Option<u64>,
+    // The spec defines these two fields, but in practice they are often omitted.
+    pub relative_header_offset: Option<u64>,
+    pub disk_start_number: Option<u32>,
+}
+
+impl Zip64ExtendedInformationExtraField {
+    pub(crate) fn content_size(&self) -> usize {
+        self.uncompressed_size.map(|_| 8).unwrap_or_default()
+            + self.compressed_size.map(|_| 8).unwrap_or_default()
+            + self.relative_header_offset.map(|_| 8).unwrap_or_default()
+            + self.disk_start_number.map(|_| 4).unwrap_or_default() // a 4-byte field; matches `as_bytes`
+    }
+}
+
+/// Stores the UTF-8 version of the file comment as stored in the central directory header.
+/// https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#468
+#[derive(Clone, Debug)]
+pub enum InfoZipUnicodeCommentExtraField {
+    V1 { crc32: u32, unicode: Vec<u8> },
+    Unknown { version: u8, data: Vec<u8> },
+}
+
+/// Stores the UTF-8 version of the file name field as stored in the local header and central directory header.
+/// https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#469
+#[derive(Clone, Debug)]
+pub enum InfoZipUnicodePathExtraField {
+    V1 { crc32: u32, unicode: Vec<u8> },
+    Unknown { version: u8, data: Vec<u8> },
+}
+
+/// Represents any unparsed extra field.
+#[derive(Clone, Debug)]
+pub struct UnknownExtraField {
+    pub header_id: HeaderId,
+    pub data_size: u16,
+    pub content: Vec<u8>,
+}
+
+// https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#4312
+pub struct CentralDirectoryRecord {
+    pub v_made_by: u16,
+    pub v_needed: u16,
+    pub flags: GeneralPurposeFlag,
+    pub compression: u16,
+    pub mod_time: u16,
+    pub mod_date: u16,
+    pub crc: u32,
+    pub compressed_size: u32,
+    pub uncompressed_size: u32,
+    pub file_name_length: u16,
+    pub extra_field_length: u16,
+    pub file_comment_length: u16,
+    pub disk_start: u16,
+    pub inter_attr: u16,
+    pub exter_attr: u32,
+    pub lh_offset: u32,
+}
+
+// https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#4316
+#[derive(Debug)]
+pub struct EndOfCentralDirectoryHeader {
+    pub(crate) disk_num: u16,
+    pub(crate) start_cent_dir_disk: u16,
+    pub(crate) num_of_entries_disk: u16,
+    pub(crate) num_of_entries: u16,
+    pub(crate) size_cent_dir: u32,
+    pub(crate) cent_dir_offset: u32,
+    pub(crate) file_comm_length: u16,
+}
+
+// https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#4314
+#[derive(Debug, PartialEq)]
+pub struct Zip64EndOfCentralDirectoryRecord {
+    /// The size of this Zip64EndOfCentralDirectoryRecord.
+    /// This is specified because there is a variable-length extra zip64 information sector.
+    /// However, we will gleefully ignore this sector because it is reserved for use by PKWare.
+ pub size_of_zip64_end_of_cd_record: u64, + pub version_made_by: u16, + pub version_needed_to_extract: u16, + pub disk_number: u32, + pub disk_number_start_of_cd: u32, + pub num_entries_in_directory_on_disk: u64, + pub num_entries_in_directory: u64, + pub directory_size: u64, + pub offset_of_start_of_directory: u64, +} + +// https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#4315 +#[derive(Debug, PartialEq)] +pub struct Zip64EndOfCentralDirectoryLocator { + pub number_of_disk_with_start_of_zip64_end_of_central_directory: u32, + pub relative_offset: u64, + pub total_number_of_disks: u32, +} diff --git a/crates/async_zip/src/spec/mod.rs b/crates/async_zip/src/spec/mod.rs new file mode 100644 index 0000000..1a91ef9 --- /dev/null +++ b/crates/async_zip/src/spec/mod.rs @@ -0,0 +1,12 @@ +// Copyright (c) 2021 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +pub(crate) mod attribute; +pub(crate) mod compression; +pub(crate) mod consts; +pub(crate) mod extra_field; +pub(crate) mod header; +pub(crate) mod parse; +pub(crate) mod version; + +pub use compression::Compression; diff --git a/crates/async_zip/src/spec/parse.rs b/crates/async_zip/src/spec/parse.rs new file mode 100644 index 0000000..422d468 --- /dev/null +++ b/crates/async_zip/src/spec/parse.rs @@ -0,0 +1,345 @@ +// Copyright (c) 2021 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +use crate::error::{Result, ZipError}; +use crate::spec::header::{ + CentralDirectoryRecord, EndOfCentralDirectoryHeader, ExtraField, GeneralPurposeFlag, HeaderId, LocalFileHeader, + Zip64EndOfCentralDirectoryLocator, Zip64EndOfCentralDirectoryRecord, +}; + +use futures_lite::io::{AsyncRead, AsyncReadExt}; + +impl LocalFileHeader { + pub fn as_slice(&self) -> [u8; 26] { + let mut array = [0; 26]; + let mut cursor = 0; + + array_push!(array, cursor, self.version.to_le_bytes()); + array_push!(array, cursor, self.flags.as_slice()); + array_push!(array, cursor, self.compression.to_le_bytes()); + array_push!(array, cursor, self.mod_time.to_le_bytes()); + array_push!(array, cursor, self.mod_date.to_le_bytes()); + array_push!(array, cursor, self.crc.to_le_bytes()); + array_push!(array, cursor, self.compressed_size.to_le_bytes()); + array_push!(array, cursor, self.uncompressed_size.to_le_bytes()); + array_push!(array, cursor, self.file_name_length.to_le_bytes()); + array_push!(array, cursor, self.extra_field_length.to_le_bytes()); + + array + } +} + +impl GeneralPurposeFlag { + pub fn as_slice(&self) -> [u8; 2] { + let encrypted: u16 = match self.encrypted { + false => 0x0, + true => 0b1, + }; + let data_descriptor: u16 = match self.data_descriptor { + false => 0x0, + true => 0x8, + }; + let filename_unicode: u16 = match self.filename_unicode { + false => 0x0, + true => 0x800, + }; + + (encrypted | data_descriptor | filename_unicode).to_le_bytes() + } +} + +impl CentralDirectoryRecord { + pub fn as_slice(&self) -> [u8; 42] { + let mut array = [0; 42]; + let mut cursor = 0; + + array_push!(array, cursor, self.v_made_by.to_le_bytes()); + array_push!(array, cursor, self.v_needed.to_le_bytes()); + array_push!(array, cursor, self.flags.as_slice()); + array_push!(array, cursor, self.compression.to_le_bytes()); + array_push!(array, cursor, self.mod_time.to_le_bytes()); + array_push!(array, cursor, self.mod_date.to_le_bytes()); + array_push!(array, cursor, self.crc.to_le_bytes()); + array_push!(array, cursor, 
self.compressed_size.to_le_bytes()); + array_push!(array, cursor, self.uncompressed_size.to_le_bytes()); + array_push!(array, cursor, self.file_name_length.to_le_bytes()); + array_push!(array, cursor, self.extra_field_length.to_le_bytes()); + array_push!(array, cursor, self.file_comment_length.to_le_bytes()); + array_push!(array, cursor, self.disk_start.to_le_bytes()); + array_push!(array, cursor, self.inter_attr.to_le_bytes()); + array_push!(array, cursor, self.exter_attr.to_le_bytes()); + array_push!(array, cursor, self.lh_offset.to_le_bytes()); + + array + } +} + +impl EndOfCentralDirectoryHeader { + pub fn as_slice(&self) -> [u8; 18] { + let mut array = [0; 18]; + let mut cursor = 0; + + array_push!(array, cursor, self.disk_num.to_le_bytes()); + array_push!(array, cursor, self.start_cent_dir_disk.to_le_bytes()); + array_push!(array, cursor, self.num_of_entries_disk.to_le_bytes()); + array_push!(array, cursor, self.num_of_entries.to_le_bytes()); + array_push!(array, cursor, self.size_cent_dir.to_le_bytes()); + array_push!(array, cursor, self.cent_dir_offset.to_le_bytes()); + array_push!(array, cursor, self.file_comm_length.to_le_bytes()); + + array + } +} + +impl From<[u8; 26]> for LocalFileHeader { + fn from(value: [u8; 26]) -> LocalFileHeader { + LocalFileHeader { + version: u16::from_le_bytes(value[0..2].try_into().unwrap()), + flags: GeneralPurposeFlag::from(u16::from_le_bytes(value[2..4].try_into().unwrap())), + compression: u16::from_le_bytes(value[4..6].try_into().unwrap()), + mod_time: u16::from_le_bytes(value[6..8].try_into().unwrap()), + mod_date: u16::from_le_bytes(value[8..10].try_into().unwrap()), + crc: u32::from_le_bytes(value[10..14].try_into().unwrap()), + compressed_size: u32::from_le_bytes(value[14..18].try_into().unwrap()), + uncompressed_size: u32::from_le_bytes(value[18..22].try_into().unwrap()), + file_name_length: u16::from_le_bytes(value[22..24].try_into().unwrap()), + extra_field_length: u16::from_le_bytes(value[24..26].try_into().unwrap()), + } + } +} + +impl From for GeneralPurposeFlag { + fn from(value: u16) -> GeneralPurposeFlag { + let encrypted = !matches!(value & 0x1, 0); + let data_descriptor = !matches!((value & 0x8) >> 3, 0); + let filename_unicode = !matches!((value & 0x800) >> 11, 0); + + GeneralPurposeFlag { encrypted, data_descriptor, filename_unicode } + } +} + +impl From<[u8; 42]> for CentralDirectoryRecord { + fn from(value: [u8; 42]) -> CentralDirectoryRecord { + CentralDirectoryRecord { + v_made_by: u16::from_le_bytes(value[0..2].try_into().unwrap()), + v_needed: u16::from_le_bytes(value[2..4].try_into().unwrap()), + flags: GeneralPurposeFlag::from(u16::from_le_bytes(value[4..6].try_into().unwrap())), + compression: u16::from_le_bytes(value[6..8].try_into().unwrap()), + mod_time: u16::from_le_bytes(value[8..10].try_into().unwrap()), + mod_date: u16::from_le_bytes(value[10..12].try_into().unwrap()), + crc: u32::from_le_bytes(value[12..16].try_into().unwrap()), + compressed_size: u32::from_le_bytes(value[16..20].try_into().unwrap()), + uncompressed_size: u32::from_le_bytes(value[20..24].try_into().unwrap()), + file_name_length: u16::from_le_bytes(value[24..26].try_into().unwrap()), + extra_field_length: u16::from_le_bytes(value[26..28].try_into().unwrap()), + file_comment_length: u16::from_le_bytes(value[28..30].try_into().unwrap()), + disk_start: u16::from_le_bytes(value[30..32].try_into().unwrap()), + inter_attr: u16::from_le_bytes(value[32..34].try_into().unwrap()), + exter_attr: u32::from_le_bytes(value[34..38].try_into().unwrap()), + 
+impl From<[u8; 42]> for CentralDirectoryRecord {
+    fn from(value: [u8; 42]) -> CentralDirectoryRecord {
+        CentralDirectoryRecord {
+            v_made_by: u16::from_le_bytes(value[0..2].try_into().unwrap()),
+            v_needed: u16::from_le_bytes(value[2..4].try_into().unwrap()),
+            flags: GeneralPurposeFlag::from(u16::from_le_bytes(value[4..6].try_into().unwrap())),
+            compression: u16::from_le_bytes(value[6..8].try_into().unwrap()),
+            mod_time: u16::from_le_bytes(value[8..10].try_into().unwrap()),
+            mod_date: u16::from_le_bytes(value[10..12].try_into().unwrap()),
+            crc: u32::from_le_bytes(value[12..16].try_into().unwrap()),
+            compressed_size: u32::from_le_bytes(value[16..20].try_into().unwrap()),
+            uncompressed_size: u32::from_le_bytes(value[20..24].try_into().unwrap()),
+            file_name_length: u16::from_le_bytes(value[24..26].try_into().unwrap()),
+            extra_field_length: u16::from_le_bytes(value[26..28].try_into().unwrap()),
+            file_comment_length: u16::from_le_bytes(value[28..30].try_into().unwrap()),
+            disk_start: u16::from_le_bytes(value[30..32].try_into().unwrap()),
+            inter_attr: u16::from_le_bytes(value[32..34].try_into().unwrap()),
+            exter_attr: u32::from_le_bytes(value[34..38].try_into().unwrap()),
+            lh_offset: u32::from_le_bytes(value[38..42].try_into().unwrap()),
+        }
+    }
+}
+
+impl From<[u8; 18]> for EndOfCentralDirectoryHeader {
+    fn from(value: [u8; 18]) -> EndOfCentralDirectoryHeader {
+        EndOfCentralDirectoryHeader {
+            disk_num: u16::from_le_bytes(value[0..2].try_into().unwrap()),
+            start_cent_dir_disk: u16::from_le_bytes(value[2..4].try_into().unwrap()),
+            num_of_entries_disk: u16::from_le_bytes(value[4..6].try_into().unwrap()),
+            num_of_entries: u16::from_le_bytes(value[6..8].try_into().unwrap()),
+            size_cent_dir: u32::from_le_bytes(value[8..12].try_into().unwrap()),
+            cent_dir_offset: u32::from_le_bytes(value[12..16].try_into().unwrap()),
+            file_comm_length: u16::from_le_bytes(value[16..18].try_into().unwrap()),
+        }
+    }
+}
+
+impl From<[u8; 52]> for Zip64EndOfCentralDirectoryRecord {
+    fn from(value: [u8; 52]) -> Self {
+        Self {
+            size_of_zip64_end_of_cd_record: u64::from_le_bytes(value[0..8].try_into().unwrap()),
+            version_made_by: u16::from_le_bytes(value[8..10].try_into().unwrap()),
+            version_needed_to_extract: u16::from_le_bytes(value[10..12].try_into().unwrap()),
+            disk_number: u32::from_le_bytes(value[12..16].try_into().unwrap()),
+            disk_number_start_of_cd: u32::from_le_bytes(value[16..20].try_into().unwrap()),
+            num_entries_in_directory_on_disk: u64::from_le_bytes(value[20..28].try_into().unwrap()),
+            num_entries_in_directory: u64::from_le_bytes(value[28..36].try_into().unwrap()),
+            directory_size: u64::from_le_bytes(value[36..44].try_into().unwrap()),
+            offset_of_start_of_directory: u64::from_le_bytes(value[44..52].try_into().unwrap()),
+        }
+    }
+}
+
+impl From<[u8; 16]> for Zip64EndOfCentralDirectoryLocator {
+    fn from(value: [u8; 16]) -> Self {
+        Self {
+            number_of_disk_with_start_of_zip64_end_of_central_directory: u32::from_le_bytes(
+                value[0..4].try_into().unwrap(),
+            ),
+            relative_offset: u64::from_le_bytes(value[4..12].try_into().unwrap()),
+            total_number_of_disks: u32::from_le_bytes(value[12..16].try_into().unwrap()),
+        }
+    }
+}
+
+impl LocalFileHeader {
+    pub async fn from_reader<R: AsyncRead + Unpin>(reader: &mut R) -> Result<LocalFileHeader> {
+        let mut buffer: [u8; 26] = [0; 26];
+        reader.read_exact(&mut buffer).await?;
+        Ok(LocalFileHeader::from(buffer))
+    }
+}
+
+impl EndOfCentralDirectoryHeader {
+    pub async fn from_reader<R: AsyncRead + Unpin>(reader: &mut R) -> Result<EndOfCentralDirectoryHeader> {
+        let mut buffer: [u8; 18] = [0; 18];
+        reader.read_exact(&mut buffer).await?;
+        Ok(EndOfCentralDirectoryHeader::from(buffer))
+    }
+}
+
+impl CentralDirectoryRecord {
+    pub async fn from_reader<R: AsyncRead + Unpin>(reader: &mut R) -> Result<CentralDirectoryRecord> {
+        let mut buffer: [u8; 42] = [0; 42];
+        reader.read_exact(&mut buffer).await?;
+        Ok(CentralDirectoryRecord::from(buffer))
+    }
+}
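+
+// Illustrative sketch (hypothetical, not part of the upstream test suite):
+// because the headers round-trip through `as_slice`/`From`, `from_reader` can
+// be checked against a raw little-endian buffer.
+//
+// ```
+// let mut cursor = futures_lite::io::Cursor::new([0u8; 18]);
+// let header = EndOfCentralDirectoryHeader::from_reader(&mut cursor).await?;
+// assert_eq!(header.as_slice(), [0u8; 18]);
+// ```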
+impl Zip64EndOfCentralDirectoryRecord {
+    pub async fn from_reader<R: AsyncRead + Unpin>(reader: &mut R) -> Result<Self> {
+        let mut buffer: [u8; 52] = [0; 52];
+        reader.read_exact(&mut buffer).await?;
+        Ok(Self::from(buffer))
+    }
+
+    pub fn as_bytes(&self) -> [u8; 52] {
+        let mut array = [0; 52];
+        let mut cursor = 0;
+
+        array_push!(array, cursor, self.size_of_zip64_end_of_cd_record.to_le_bytes());
+        array_push!(array, cursor, self.version_made_by.to_le_bytes());
+        array_push!(array, cursor, self.version_needed_to_extract.to_le_bytes());
+        array_push!(array, cursor, self.disk_number.to_le_bytes());
+        array_push!(array, cursor, self.disk_number_start_of_cd.to_le_bytes());
+        array_push!(array, cursor, self.num_entries_in_directory_on_disk.to_le_bytes());
+        array_push!(array, cursor, self.num_entries_in_directory.to_le_bytes());
+        array_push!(array, cursor, self.directory_size.to_le_bytes());
+        array_push!(array, cursor, self.offset_of_start_of_directory.to_le_bytes());
+
+        array
+    }
+}
+
+impl Zip64EndOfCentralDirectoryLocator {
+    /// Read 4 bytes from the reader and check whether the signature matches that of the EOCDL.
+    /// If it does, return Some(EOCDL), otherwise return None.
+    pub async fn try_from_reader<R: AsyncRead + Unpin>(
+        reader: &mut R,
+    ) -> Result<Option<Zip64EndOfCentralDirectoryLocator>> {
+        let signature = {
+            let mut buffer = [0; 4];
+            reader.read_exact(&mut buffer).await?;
+            u32::from_le_bytes(buffer)
+        };
+        if signature != ZIP64_EOCDL_SIGNATURE {
+            return Ok(None);
+        }
+        let mut buffer: [u8; 16] = [0; 16];
+        reader.read_exact(&mut buffer).await?;
+        Ok(Some(Self::from(buffer)))
+    }
+
+    pub fn as_bytes(&self) -> [u8; 16] {
+        let mut array = [0; 16];
+        let mut cursor = 0;
+
+        array_push!(array, cursor, self.number_of_disk_with_start_of_zip64_end_of_central_directory.to_le_bytes());
+        array_push!(array, cursor, self.relative_offset.to_le_bytes());
+        array_push!(array, cursor, self.total_number_of_disks.to_le_bytes());
+
+        array
+    }
+}
+
+/// Parse the extra fields.
+pub fn parse_extra_fields(data: Vec<u8>, uncompressed_size: u32, compressed_size: u32) -> Result<Vec<ExtraField>> {
+    let mut cursor = 0;
+    let mut extra_fields = Vec::new();
+    while cursor + 4 < data.len() {
+        let header_id: HeaderId = u16::from_le_bytes(data[cursor..cursor + 2].try_into().unwrap()).into();
+        let field_size = u16::from_le_bytes(data[cursor + 2..cursor + 4].try_into().unwrap());
+        if cursor + 4 + field_size as usize > data.len() {
+            return Err(ZipError::InvalidExtraFieldHeader(field_size, data.len() - cursor - 8 - field_size as usize));
+        }
+        let data = &data[cursor + 4..cursor + 4 + field_size as usize];
+        extra_fields.push(extra_field_from_bytes(header_id, field_size, data, uncompressed_size, compressed_size)?);
+        cursor += 4 + field_size as usize;
+    }
+    Ok(extra_fields)
+}
+
+/// Replace elements of an array at a given cursor index for use with a zero-initialised array.
+macro_rules!
array_push { + ($arr:ident, $cursor:ident, $value:expr) => {{ + for entry in $value { + $arr[$cursor] = entry; + $cursor += 1; + } + }}; +} + +use crate::spec::consts::ZIP64_EOCDL_SIGNATURE; +use crate::spec::extra_field::extra_field_from_bytes; +pub(crate) use array_push; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_zip64_eocdr() { + let eocdr: [u8; 56] = [ + 0x50, 0x4B, 0x06, 0x06, 0x2C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E, 0x03, 0x2D, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, + ]; + + let without_signature: [u8; 52] = eocdr[4..56].try_into().unwrap(); + let zip64eocdr = Zip64EndOfCentralDirectoryRecord::from(without_signature); + assert_eq!( + zip64eocdr, + Zip64EndOfCentralDirectoryRecord { + size_of_zip64_end_of_cd_record: 44, + version_made_by: 798, + version_needed_to_extract: 45, + disk_number: 0, + disk_number_start_of_cd: 0, + num_entries_in_directory_on_disk: 1, + num_entries_in_directory: 1, + directory_size: 47, + offset_of_start_of_directory: 64, + } + ) + } + + #[tokio::test] + async fn test_parse_zip64_eocdl() { + let eocdl: [u8; 20] = [ + 0x50, 0x4B, 0x06, 0x07, 0x00, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, + ]; + let mut cursor = futures_lite::io::Cursor::new(eocdl); + let zip64eocdl = Zip64EndOfCentralDirectoryLocator::try_from_reader(&mut cursor).await.unwrap().unwrap(); + assert_eq!( + zip64eocdl, + Zip64EndOfCentralDirectoryLocator { + number_of_disk_with_start_of_zip64_end_of_central_directory: 0, + relative_offset: 111, + total_number_of_disks: 1, + } + ) + } +} diff --git a/crates/async_zip/src/spec/version.rs b/crates/async_zip/src/spec/version.rs new file mode 100644 index 0000000..cf0e69d --- /dev/null +++ b/crates/async_zip/src/spec/version.rs @@ -0,0 +1,42 @@ +// Copyright (c) 2021 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +use crate::entry::ZipEntry; +#[cfg(any( + feature = "deflate", + feature = "bzip2", + feature = "zstd", + feature = "lzma", + feature = "xz", + feature = "deflate64" +))] +use crate::spec::Compression; + +pub(crate) const SPEC_VERSION_MADE_BY: u16 = 63; + +// https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#443 +pub fn as_needed_to_extract(entry: &ZipEntry) -> u16 { + let mut version = match entry.compression() { + #[cfg(feature = "deflate")] + Compression::Deflate => 20, + #[cfg(feature = "deflate64")] + Compression::Deflate64 => 21, + #[cfg(feature = "bzip2")] + Compression::Bz => 46, + #[cfg(feature = "lzma")] + Compression::Lzma => 63, + _ => 10, + }; + + if let Ok(true) = entry.dir() { + version = std::cmp::max(version, 20); + } + + version +} + +// https://github.com/Majored/rs-async-zip/blob/main/SPECIFICATION.md#442 +pub fn as_made_by() -> u16 { + // Default to UNIX mapping for the moment. + 3 << 8 | SPEC_VERSION_MADE_BY +} diff --git a/crates/async_zip/src/string.rs b/crates/async_zip/src/string.rs new file mode 100644 index 0000000..4bab8ed --- /dev/null +++ b/crates/async_zip/src/string.rs @@ -0,0 +1,112 @@ +// Copyright (c) 2023 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +use crate::error::{Result, ZipError}; + +/// A string encoding supported by this crate. 
+#[derive(Debug, Clone, Copy)]
+pub enum StringEncoding {
+    Utf8,
+    Raw,
+}
+
+/// A string wrapper for handling different encodings.
+#[derive(Debug, Clone)]
+pub struct ZipString {
+    encoding: StringEncoding,
+    raw: Vec<u8>,
+    alternative: Option<Vec<u8>>,
+}
+
+impl ZipString {
+    /// Constructs a new encoded string from its raw bytes and its encoding type.
+    ///
+    /// # Note
+    /// If the provided encoding is [`StringEncoding::Utf8`] but the raw bytes are not valid UTF-8 (i.e. a call to
+    /// `std::str::from_utf8()` fails), the encoding defaults back to [`StringEncoding::Raw`].
+    pub fn new(raw: Vec<u8>, mut encoding: StringEncoding) -> Self {
+        if let StringEncoding::Utf8 = encoding {
+            if std::str::from_utf8(&raw).is_err() {
+                encoding = StringEncoding::Raw;
+            }
+        }
+
+        Self { encoding, raw, alternative: None }
+    }
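+
+    // Illustrative sketch of the fallback behaviour (hypothetical bytes, not
+    // part of the upstream docs):
+    //
+    // ```
+    // let s = ZipString::new(vec![0xD6, 0xD0], StringEncoding::Utf8);
+    // assert!(matches!(s.encoding(), StringEncoding::Raw));
+    // assert!(s.as_str().is_err());
+    // ```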
+
+    /// Constructs a new encoded string from utf-8 data, with an alternative in native MBCS encoding.
+    pub fn new_with_alternative(utf8: String, alternative: Vec<u8>) -> Self {
+        Self { encoding: StringEncoding::Utf8, raw: utf8.into_bytes(), alternative: Some(alternative) }
+    }
+
+    /// Returns the raw bytes for this string.
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.raw
+    }
+
+    /// Returns the encoding type for this string.
+    pub fn encoding(&self) -> StringEncoding {
+        self.encoding
+    }
+
+    /// Returns the alternative bytes (in native MBCS encoding) for this string.
+    pub fn alternative(&self) -> Option<&[u8]> {
+        self.alternative.as_deref()
+    }
+
+    /// Returns the raw bytes converted into a string slice.
+    ///
+    /// # Note
+    /// A call to this method will only succeed if the encoding type is [`StringEncoding::Utf8`].
+    pub fn as_str(&self) -> Result<&str> {
+        if !matches!(self.encoding, StringEncoding::Utf8) {
+            return Err(ZipError::StringNotUtf8);
+        }
+
+        // SAFETY:
+        // "The bytes passed in must be valid UTF-8."
+        //
+        // This function will error if self.encoding is not StringEncoding::Utf8.
+        //
+        // self.encoding is only ever StringEncoding::Utf8 if this variant was provided to the constructor AND the
+        // call to `std::str::from_utf8()` within the constructor succeeded. Mutable access to the inner vector is
+        // never given and no method implemented on this type mutates the inner vector.
+
+        Ok(unsafe { std::str::from_utf8_unchecked(&self.raw) })
+    }
+
+    /// Returns the raw bytes converted to an owned string.
+    ///
+    /// # Note
+    /// A call to this method will only succeed if the encoding type is [`StringEncoding::Utf8`].
+    pub fn into_string(self) -> Result<String> {
+        if !matches!(self.encoding, StringEncoding::Utf8) {
+            return Err(ZipError::StringNotUtf8);
+        }
+
+        // SAFETY: See above.
+        Ok(unsafe { String::from_utf8_unchecked(self.raw) })
+    }
+
+    /// Returns the alternative bytes (in native MBCS encoding) converted into an owned `Vec<u8>`.
+    pub fn into_alternative(self) -> Option<Vec<u8>> {
+        self.alternative
+    }
+
+    /// Returns whether this string is encoded as utf-8 without an alternative.
+    pub fn is_utf8_without_alternative(&self) -> bool {
+        matches!(self.encoding, StringEncoding::Utf8) && self.alternative.is_none()
+    }
+}
+
+impl From<String> for ZipString {
+    fn from(value: String) -> Self {
+        Self { encoding: StringEncoding::Utf8, raw: value.into_bytes(), alternative: None }
+    }
+}
+
+impl From<&str> for ZipString {
+    fn from(value: &str) -> Self {
+        Self { encoding: StringEncoding::Utf8, raw: value.as_bytes().to_vec(), alternative: None }
+    }
+}
diff --git a/crates/async_zip/src/tests/combined/mod.rs b/crates/async_zip/src/tests/combined/mod.rs
new file mode 100644
index 0000000..d17ab7d
--- /dev/null
+++ b/crates/async_zip/src/tests/combined/mod.rs
@@ -0,0 +1,2 @@
+// Copyright (c) 2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
diff --git a/crates/async_zip/src/tests/mod.rs b/crates/async_zip/src/tests/mod.rs
new file mode 100644
index 0000000..35ecf91
--- /dev/null
+++ b/crates/async_zip/src/tests/mod.rs
@@ -0,0 +1,16 @@
+// Copyright (c) 2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+pub(crate) mod combined;
+pub(crate) mod read;
+pub(crate) mod spec;
+pub(crate) mod write;
+
+use std::sync::Once;
+static ENV_LOGGER: Once = Once::new();
+
+/// Initialize the env logger for any tests that require it.
+/// Safe to call multiple times.
+fn init_logger() {
+    ENV_LOGGER.call_once(|| env_logger::Builder::from_default_env().format_module_path(true).init());
+}
diff --git a/crates/async_zip/src/tests/read/compression/bzip2.data b/crates/async_zip/src/tests/read/compression/bzip2.data
new file mode 100644
index 0000000..bff81d5
Binary files /dev/null and b/crates/async_zip/src/tests/read/compression/bzip2.data differ
diff --git a/crates/async_zip/src/tests/read/compression/deflate.data b/crates/async_zip/src/tests/read/compression/deflate.data
new file mode 100644
index 0000000..283d32b
Binary files /dev/null and b/crates/async_zip/src/tests/read/compression/deflate.data differ
diff --git a/crates/async_zip/src/tests/read/compression/lzma.data b/crates/async_zip/src/tests/read/compression/lzma.data
new file mode 100644
index 0000000..08f95f7
Binary files /dev/null and b/crates/async_zip/src/tests/read/compression/lzma.data differ
diff --git a/crates/async_zip/src/tests/read/compression/mod.rs b/crates/async_zip/src/tests/read/compression/mod.rs
new file mode 100644
index 0000000..eadd8d6
--- /dev/null
+++ b/crates/async_zip/src/tests/read/compression/mod.rs
@@ -0,0 +1,46 @@
+// Copyright (c) 2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+use crate::base::read::io::compressed::CompressedReader;
+use crate::spec::Compression;
+
+compressed_test_helper!(stored_test, Compression::Stored, "foo bar", "foo bar");
+
+#[cfg(feature = "deflate")]
+compressed_test_helper!(deflate_test, Compression::Deflate, "foo bar", include_bytes!("deflate.data"));
+
+#[cfg(feature = "bzip2")]
+compressed_test_helper!(bz_test, Compression::Bz, "foo bar", include_bytes!("bzip2.data"));
+
+#[cfg(feature = "lzma")]
+compressed_test_helper!(lzma_test, Compression::Lzma, "foo bar", include_bytes!("lzma.data"));
+
+#[cfg(feature = "zstd")]
+compressed_test_helper!(zstd_test, Compression::Zstd, "foo bar", include_bytes!("zstd.data"));
+
+#[cfg(feature = "xz")]
+compressed_test_helper!(xz_test, Compression::Xz, "foo bar", include_bytes!("xz.data"));
+
+/// A helper macro for generating a CompressedReader
test using a specific compression method. +macro_rules! compressed_test_helper { + ($name:ident, $typ:expr, $data_raw:expr, $data:expr) => { + #[cfg(test)] + #[tokio::test] + async fn $name() { + use futures_lite::io::{AsyncReadExt, Cursor}; + + let data = $data; + let data_raw = $data_raw; + + let cursor = Cursor::new(data); + let mut reader = CompressedReader::new(cursor, $typ); + + let mut read_data = String::new(); + reader.read_to_string(&mut read_data).await.expect("read into CompressedReader failed"); + + assert_eq!(read_data, data_raw); + } + }; +} + +use compressed_test_helper; diff --git a/crates/async_zip/src/tests/read/compression/xz.data b/crates/async_zip/src/tests/read/compression/xz.data new file mode 100644 index 0000000..058526b Binary files /dev/null and b/crates/async_zip/src/tests/read/compression/xz.data differ diff --git a/crates/async_zip/src/tests/read/compression/zstd.data b/crates/async_zip/src/tests/read/compression/zstd.data new file mode 100644 index 0000000..beaa09f Binary files /dev/null and b/crates/async_zip/src/tests/read/compression/zstd.data differ diff --git a/crates/async_zip/src/tests/read/locator/empty-buffer-boundary.zip b/crates/async_zip/src/tests/read/locator/empty-buffer-boundary.zip new file mode 100644 index 0000000..b6b7174 Binary files /dev/null and b/crates/async_zip/src/tests/read/locator/empty-buffer-boundary.zip differ diff --git a/crates/async_zip/src/tests/read/locator/empty-with-max-comment.zip b/crates/async_zip/src/tests/read/locator/empty-with-max-comment.zip new file mode 100644 index 0000000..fc1f498 Binary files /dev/null and b/crates/async_zip/src/tests/read/locator/empty-with-max-comment.zip differ diff --git a/crates/async_zip/src/tests/read/locator/empty.zip b/crates/async_zip/src/tests/read/locator/empty.zip new file mode 100644 index 0000000..15cb0ec Binary files /dev/null and b/crates/async_zip/src/tests/read/locator/empty.zip differ diff --git a/crates/async_zip/src/tests/read/locator/mod.rs b/crates/async_zip/src/tests/read/locator/mod.rs new file mode 100644 index 0000000..d08950c --- /dev/null +++ b/crates/async_zip/src/tests/read/locator/mod.rs @@ -0,0 +1,64 @@ +// Copyright (c) 2022 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) + +#[test] +fn search_one_byte_test() { + let buffer: &[u8] = &[0x0, 0x0, 0x0, 0x0, 0x0, 0x0]; + let signature: &[u8] = &[0x1]; + + let matched = crate::base::read::io::locator::reverse_search_buffer(buffer, signature); + assert!(matched.is_none()); + + let buffer: &[u8] = &[0x2, 0x1, 0x0, 0x0, 0x0, 0x0]; + let signature: &[u8] = &[0x1]; + + let matched = crate::base::read::io::locator::reverse_search_buffer(buffer, signature); + assert!(matched.is_some()); + assert_eq!(1, matched.unwrap()); +} + +#[test] +fn search_two_byte_test() { + let buffer: &[u8] = &[0x2, 0x1, 0x0, 0x0, 0x0, 0x0]; + let signature: &[u8] = &[0x2, 0x1]; + + let matched = crate::base::read::io::locator::reverse_search_buffer(buffer, signature); + assert!(matched.is_some()); + assert_eq!(1, matched.unwrap()); +} + +#[tokio::test] +async fn locator_empty_test() { + use futures_lite::io::Cursor; + + let data = &include_bytes!("empty.zip"); + let mut cursor = Cursor::new(data); + let eocdr = crate::base::read::io::locator::eocdr(&mut cursor).await; + + assert!(eocdr.is_ok()); + assert_eq!(eocdr.unwrap(), 4); +} + +#[tokio::test] +async fn locator_empty_max_comment_test() { + use futures_lite::io::Cursor; + + let data = 
&include_bytes!("empty-with-max-comment.zip");
+    let mut cursor = Cursor::new(data);
+    let eocdr = crate::base::read::io::locator::eocdr(&mut cursor).await;
+
+    assert!(eocdr.is_ok());
+    assert_eq!(eocdr.unwrap(), 4);
+}
+
+#[tokio::test]
+async fn locator_buffer_boundary_test() {
+    use futures_lite::io::Cursor;
+
+    let data = &include_bytes!("empty-buffer-boundary.zip");
+    let mut cursor = Cursor::new(data);
+    let eocdr = crate::base::read::io::locator::eocdr(&mut cursor).await;
+
+    assert!(eocdr.is_ok());
+    assert_eq!(eocdr.unwrap(), 4);
+}
diff --git a/crates/async_zip/src/tests/read/mod.rs b/crates/async_zip/src/tests/read/mod.rs
new file mode 100644
index 0000000..9c5f507
--- /dev/null
+++ b/crates/async_zip/src/tests/read/mod.rs
@@ -0,0 +1,6 @@
+// Copyright (c) 2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+pub(crate) mod compression;
+pub(crate) mod locator;
+pub(crate) mod zip64;
diff --git a/crates/async_zip/src/tests/read/zip64/mod.rs b/crates/async_zip/src/tests/read/zip64/mod.rs
new file mode 100644
index 0000000..758d410
--- /dev/null
+++ b/crates/async_zip/src/tests/read/zip64/mod.rs
@@ -0,0 +1,107 @@
+// Copyright (c) 2023 Harry [Majored] [hello@majored.pw]
+// Copyright (c) 2023 Cognite AS
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+use futures_lite::io::AsyncReadExt;
+
+use crate::tests::init_logger;
+
+const ZIP64_ZIP_CONTENTS: &str = "Hello World!\n";
+
+/// Tests opening and reading a zip64 archive.
+/// It contains one file named "-" with a zip64 extended field header.
+#[tokio::test]
+async fn test_read_zip64_archive_mem() {
+    use crate::base::read::mem::ZipFileReader;
+    init_logger();
+
+    let data = include_bytes!("zip64.zip").to_vec();
+
+    let reader = ZipFileReader::new(data).await.unwrap();
+    let mut entry_reader = reader.reader_without_entry(0).await.unwrap();
+
+    let mut read_data = String::new();
+    entry_reader.read_to_string(&mut read_data).await.expect("read failed");
+
+    assert_eq!(
+        read_data.chars().count(),
+        ZIP64_ZIP_CONTENTS.chars().count(),
+        "{read_data:?} != {ZIP64_ZIP_CONTENTS:?}"
+    );
+    assert_eq!(read_data, ZIP64_ZIP_CONTENTS);
+}
+
+/// Like test_read_zip64_archive_mem() but for the streaming version.
+#[tokio::test]
+async fn test_read_zip64_archive_stream() {
+    use crate::base::read::stream::ZipFileReader;
+    init_logger();
+
+    let data = include_bytes!("zip64.zip").to_vec();
+
+    let reader = ZipFileReader::new(data.as_slice());
+    let mut entry_reader = reader.next_without_entry().await.unwrap().unwrap();
+
+    let mut read_data = String::new();
+    entry_reader.reader_mut().read_to_string(&mut read_data).await.expect("read failed");
+
+    assert_eq!(
+        read_data.chars().count(),
+        ZIP64_ZIP_CONTENTS.chars().count(),
+        "{read_data:?} != {ZIP64_ZIP_CONTENTS:?}"
+    );
+    assert_eq!(read_data, ZIP64_ZIP_CONTENTS);
+}
+
+/// Generate an example file only if it doesn't exist already.
+/// The file is placed adjacent to this rs file.
+#[cfg(feature = "tokio")]
+fn generate_zip64many_zip() -> std::path::PathBuf {
+    use std::io::Write;
+    use zip::write::FileOptions;
+
+    let mut path = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    path.push("src/tests/read/zip64/zip64many.zip");
+
+    // Only recreate the zip if it doesn't already exist.
+    if path.exists() {
+        return path;
+    }
+
+    let zip_file = std::fs::File::create(&path).unwrap();
+    let mut zip = zip::ZipWriter::new(zip_file);
+    let options = FileOptions::default().compression_method(zip::CompressionMethod::Stored);
+
+    for i in 0..2_u32.pow(16) + 1 {
+        zip.start_file(format!("{i}.txt"), options).unwrap();
+        zip.write_all(b"\n").unwrap();
+    }
+
+    zip.finish().unwrap();
+
+    path
+}
+
+/// Test reading a generated zip64 archive that contains more than 2^16 entries.
+#[cfg(feature = "tokio-fs")]
+#[tokio::test]
+async fn test_read_zip64_archive_many_entries() {
+    use crate::tokio::read::fs::ZipFileReader;
+
+    init_logger();
+
+    let path = generate_zip64many_zip();
+
+    let reader = ZipFileReader::new(path).await.unwrap();
+
+    // Verify that each entry exists and has the contents "\n"
+    for i in 0..2_u32.pow(16) + 1 {
+        let entry = reader.file().entries().get(i as usize).unwrap();
+        eprintln!("{:?}", entry.filename().as_bytes());
+        assert_eq!(entry.filename().as_str().unwrap(), format!("{i}.txt"));
+        let mut entry = reader.reader_without_entry(i as usize).await.unwrap();
+        let mut contents = String::new();
+        entry.read_to_string(&mut contents).await.unwrap();
+        assert_eq!(contents, "\n");
+    }
+}
diff --git a/crates/async_zip/src/tests/read/zip64/zip64.zip b/crates/async_zip/src/tests/read/zip64/zip64.zip
new file mode 100644
index 0000000..b07a4d4
Binary files /dev/null and b/crates/async_zip/src/tests/read/zip64/zip64.zip differ
diff --git a/crates/async_zip/src/tests/spec/date.rs b/crates/async_zip/src/tests/spec/date.rs
new file mode 100644
index 0000000..151bde4
--- /dev/null
+++ b/crates/async_zip/src/tests/spec/date.rs
@@ -0,0 +1,44 @@
+// Copyright (c) 2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+#[cfg(feature = "chrono")]
+use chrono::{TimeZone, Utc};
+
+use crate::ZipDateTimeBuilder;
+
+#[test]
+#[cfg(feature = "chrono")]
+fn date_conversion_test_chrono() {
+    let original_dt = Utc.timestamp_opt(1666544102, 0).unwrap();
+    let zip_dt = crate::ZipDateTime::from_chrono(&original_dt);
+    let result_dt = zip_dt.as_chrono().single().expect("expected single unique result");
+    assert_eq!(result_dt, original_dt);
+}
+
+#[test]
+fn date_conversion_test() {
+    let year = 2000;
+    let month = 9;
+    let day = 8;
+    let hour = 7;
+    let minute = 5;
+    let second = 4;
+
+    let mut builder = ZipDateTimeBuilder::new();
+
+    builder = builder.year(year);
+    builder = builder.month(month);
+    builder = builder.day(day);
+    builder = builder.hour(hour);
+    builder = builder.minute(minute);
+    builder = builder.second(second);
+
+    let built = builder.build();
+
+    assert_eq!(year, built.year());
+    assert_eq!(month, built.month());
+    assert_eq!(day, built.day());
+    assert_eq!(hour, built.hour());
+    assert_eq!(minute, built.minute());
+    assert_eq!(second, built.second());
+}
diff --git a/crates/async_zip/src/tests/spec/mod.rs b/crates/async_zip/src/tests/spec/mod.rs
new file mode 100644
index 0000000..162826e
--- /dev/null
+++ b/crates/async_zip/src/tests/spec/mod.rs
@@ -0,0 +1,4 @@
+// Copyright (c) 2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+pub(crate) mod date;
diff --git a/crates/async_zip/src/tests/write/mod.rs b/crates/async_zip/src/tests/write/mod.rs
new file mode 100644
index 0000000..6ca7571
--- /dev/null
+++ b/crates/async_zip/src/tests/write/mod.rs
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+use futures_lite::io::AsyncWrite;
+use std::io::Error;
+use std::pin::Pin;
+use std::task::{Context, Poll};
+
+pub(crate) mod offset;
+mod zip64;
+
+/// /dev/null for AsyncWrite.
+/// Useful for tests that involve writing, but not reading, large amounts of data.
+pub(crate) struct AsyncSink;
+
+// AsyncSink is always ready to receive bytes and throw them away.
+impl AsyncWrite for AsyncSink {
+    fn poll_write(self: Pin<&mut Self>, _: &mut Context<'_>, buf: &[u8]) -> Poll<Result<usize, Error>> {
+        Poll::Ready(Ok(buf.len()))
+    }
+
+    fn poll_flush(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll<Result<(), Error>> {
+        Poll::Ready(Ok(()))
+    }
+
+    fn poll_close(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll<Result<(), Error>> {
+        Poll::Ready(Ok(()))
+    }
+}
diff --git a/crates/async_zip/src/tests/write/offset/mod.rs b/crates/async_zip/src/tests/write/offset/mod.rs
new file mode 100644
index 0000000..5ee9811
--- /dev/null
+++ b/crates/async_zip/src/tests/write/offset/mod.rs
@@ -0,0 +1,22 @@
+// Copyright (c) 2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+use crate::base::write::io::offset::AsyncOffsetWriter;
+
+#[tokio::test]
+async fn basic() {
+    use futures_lite::io::AsyncWriteExt;
+    use futures_lite::io::Cursor;
+
+    let mut writer = AsyncOffsetWriter::new(Cursor::new(Vec::new()));
+    assert_eq!(writer.offset(), 0);
+
+    writer.write_all(b"Foo. Bar. Foo. Bar.").await.expect("failed to write data");
+    assert_eq!(writer.offset(), 19);
+
+    writer.write_all(b"Foo. Foo.").await.expect("failed to write data");
+    assert_eq!(writer.offset(), 28);
+
+    writer.write_all(b"Bar. Bar.").await.expect("failed to write data");
+    assert_eq!(writer.offset(), 37);
+}
diff --git a/crates/async_zip/src/tests/write/zip64/mod.rs b/crates/async_zip/src/tests/write/zip64/mod.rs
new file mode 100644
index 0000000..01f3211
--- /dev/null
+++ b/crates/async_zip/src/tests/write/zip64/mod.rs
@@ -0,0 +1,243 @@
+// Copyright Cognite AS, 2023
+
+use crate::base::write::ZipFileWriter;
+use crate::error::{Zip64ErrorCase, ZipError};
+use crate::spec::consts::NON_ZIP64_MAX_SIZE;
+use crate::tests::init_logger;
+use crate::tests::write::AsyncSink;
+use crate::{Compression, ZipEntryBuilder};
+use std::io::Read;
+
+use crate::spec::header::ExtraField;
+use futures_lite::io::AsyncWriteExt;
+
+// Useful constants for writing a large file.
+const BATCH_SIZE: usize = 100_000;
+const NUM_BATCHES: usize = NON_ZIP64_MAX_SIZE as usize / BATCH_SIZE + 1;
+const BATCHED_FILE_SIZE: usize = NUM_BATCHES * BATCH_SIZE;
+
+/// Test writing a small zip64 file.
+/// No zip64 extra fields will be emitted for EntryWhole.
+/// Z64 end of directory record & locator should be emitted +#[tokio::test] +async fn test_write_zip64_file() { + init_logger(); + + let mut buffer = Vec::new(); + let mut writer = ZipFileWriter::new(&mut buffer).force_zip64(); + let entry = ZipEntryBuilder::new("file1".to_string().into(), Compression::Stored); + writer.write_entry_whole(entry, &[0, 0, 0, 0]).await.unwrap(); + let entry = ZipEntryBuilder::new("file2".to_string().into(), Compression::Stored); + let mut entry_writer = writer.write_entry_stream(entry).await.unwrap(); + entry_writer.write_all(&[0, 0, 0, 0]).await.unwrap(); + entry_writer.close().await.unwrap(); + writer.close().await.unwrap(); + + let cursor = std::io::Cursor::new(buffer); + let mut zip = zip::read::ZipArchive::new(cursor).unwrap(); + let mut file1 = zip.by_name("file1").unwrap(); + assert_eq!(file1.extra_data(), &[] as &[u8]); + let mut buffer = Vec::new(); + file1.read_to_end(&mut buffer).unwrap(); + assert_eq!(buffer.as_slice(), &[0, 0, 0, 0]); + drop(file1); + + let mut file2 = zip.by_name("file2").unwrap(); + let mut buffer = Vec::new(); + file2.read_to_end(&mut buffer).unwrap(); + assert_eq!(buffer.as_slice(), &[0, 0, 0, 0]); +} + +/// Test writing a large zip64 file. This test will use upwards of 4GB of memory. +#[tokio::test] +async fn test_write_large_zip64_file() { + init_logger(); + + // Allocate space with some extra for metadata records + let mut buffer = Vec::with_capacity(BATCHED_FILE_SIZE + 100_000); + let mut writer = ZipFileWriter::new(&mut buffer); + + // Stream-written zip files are dubiously spec-conformant. We need to specify a valid file size + // in order for rs-zip (and unzip) to correctly read these files. + let entry = ZipEntryBuilder::new("file".to_string().into(), Compression::Stored) + .size(BATCHED_FILE_SIZE as u64, BATCHED_FILE_SIZE as u64); + let mut entry_writer = writer.write_entry_stream(entry).await.unwrap(); + for _ in 0..NUM_BATCHES { + entry_writer.write_all(&[0; BATCH_SIZE]).await.unwrap(); + } + entry_writer.close().await.unwrap(); + + assert!(writer.is_zip64); + let cd_entry = writer.cd_entries.last().unwrap(); + match &cd_entry.entry.extra_fields.last().unwrap() { + ExtraField::Zip64ExtendedInformation(zip64) => { + assert_eq!(zip64.compressed_size.unwrap(), BATCHED_FILE_SIZE as u64); + assert_eq!(zip64.uncompressed_size.unwrap(), BATCHED_FILE_SIZE as u64); + } + e => panic!("Expected a Zip64 extended field, got {:?}", e), + } + assert_eq!(cd_entry.header.uncompressed_size, NON_ZIP64_MAX_SIZE); + assert_eq!(cd_entry.header.compressed_size, NON_ZIP64_MAX_SIZE); + writer.close().await.unwrap(); + + let cursor = std::io::Cursor::new(buffer); + let mut archive = zip::read::ZipArchive::new(cursor).unwrap(); + let mut file = archive.by_name("file").unwrap(); + assert_eq!(file.compression(), zip::CompressionMethod::Stored); + assert_eq!(file.size(), BATCHED_FILE_SIZE as u64); + let mut buffer = [0; 100_000]; + let mut bytes_total = 0; + loop { + let read_bytes = file.read(&mut buffer).unwrap(); + if read_bytes == 0 { + break; + } + bytes_total += read_bytes; + } + assert_eq!(bytes_total, BATCHED_FILE_SIZE); +} + +/// Test writing a file, and reading it with async-zip +#[tokio::test] +async fn test_write_large_zip64_file_self_read() { + use futures_lite::io::AsyncReadExt; + + init_logger(); + + // Allocate space with some extra for metadata records + let mut buffer = Vec::with_capacity(BATCHED_FILE_SIZE + 100_000); + let mut writer = ZipFileWriter::new(&mut buffer); + + let entry = ZipEntryBuilder::new("file".into(), 
Compression::Stored); + let mut entry_writer = writer.write_entry_stream(entry).await.unwrap(); + for _ in 0..NUM_BATCHES { + entry_writer.write_all(&[0; BATCH_SIZE]).await.unwrap(); + } + entry_writer.close().await.unwrap(); + writer.close().await.unwrap(); + + let reader = crate::base::read::mem::ZipFileReader::new(buffer).await.unwrap(); + assert!(reader.file().zip64); + assert_eq!(reader.file().entries[0].entry.filename().as_str().unwrap(), "file"); + assert_eq!(reader.file().entries[0].entry.compressed_size, BATCHED_FILE_SIZE as u64); + let mut entry = reader.reader_without_entry(0).await.unwrap(); + + let mut buffer = [0; 100_000]; + let mut bytes_total = 0; + loop { + let read_bytes = entry.read(&mut buffer).await.unwrap(); + if read_bytes == 0 { + break; + } + bytes_total += read_bytes; + } + assert_eq!(bytes_total, BATCHED_FILE_SIZE); +} + +/// Test writing a zip64 file with more than u16::MAX files. +#[tokio::test] +async fn test_write_zip64_file_many_entries() { + init_logger(); + + // The generated file will likely be ~3MB in size. + let mut buffer = Vec::with_capacity(3_500_000); + + let mut writer = ZipFileWriter::new(&mut buffer); + for i in 0..=u16::MAX as u32 + 1 { + let entry = ZipEntryBuilder::new(i.to_string().into(), Compression::Stored); + writer.write_entry_whole(entry, &[]).await.unwrap(); + } + assert!(writer.is_zip64); + writer.close().await.unwrap(); + + let cursor = std::io::Cursor::new(buffer); + let mut zip = zip::read::ZipArchive::new(cursor).unwrap(); + assert_eq!(zip.len(), u16::MAX as usize + 2); + + for i in 0..=u16::MAX as u32 + 1 { + let mut file = zip.by_name(&i.to_string()).unwrap(); + let mut buf = Vec::new(); + file.read_to_end(&mut buf).unwrap(); + } +} + +/// Tests that EntryWholeWriter switches to Zip64 mode when writing too many files for a non-Zip64. +#[tokio::test] +async fn test_zip64_when_many_files_whole() { + let mut sink = AsyncSink; + let mut writer = ZipFileWriter::new(&mut sink); + for i in 0..=u16::MAX as u32 + 1 { + let entry = ZipEntryBuilder::new(format!("{i}").into(), Compression::Stored); + writer.write_entry_whole(entry, &[]).await.unwrap() + } + assert!(writer.is_zip64); + writer.close().await.unwrap(); +} + +/// Tests that EntryStreamWriter switches to Zip64 mode when writing too many files for a non-Zip64. +#[tokio::test] +async fn test_zip64_when_many_files_stream() { + let mut sink = AsyncSink; + let mut writer = ZipFileWriter::new(&mut sink); + for i in 0..=u16::MAX as u32 + 1 { + let entry = ZipEntryBuilder::new(format!("{i}").into(), Compression::Stored); + let entrywriter = writer.write_entry_stream(entry).await.unwrap(); + entrywriter.close().await.unwrap(); + } + + assert!(writer.is_zip64); + writer.close().await.unwrap(); +} + +/// Tests that when force_no_zip64 is true, EntryWholeWriter errors when trying to write more than +/// u16::MAX files to a single archive. 
+#[tokio::test]
+async fn test_force_no_zip64_errors_with_too_many_files_whole() {
+    let mut sink = AsyncSink;
+    let mut writer = ZipFileWriter::new(&mut sink).force_no_zip64();
+    for i in 0..u16::MAX {
+        let entry = ZipEntryBuilder::new(format!("{i}").into(), Compression::Stored);
+        writer.write_entry_whole(entry, &[]).await.unwrap()
+    }
+    let entry = ZipEntryBuilder::new("65537".to_string().into(), Compression::Stored);
+    let result = writer.write_entry_whole(entry, &[]).await;
+
+    assert!(matches!(result, Err(ZipError::Zip64Needed(Zip64ErrorCase::TooManyFiles))));
+}
+
+/// Tests that when force_no_zip64 is true, EntryStreamWriter errors when trying to write more than
+/// u16::MAX files to a single archive.
+#[tokio::test]
+async fn test_force_no_zip64_errors_with_too_many_files_stream() {
+    let mut sink = AsyncSink;
+    let mut writer = ZipFileWriter::new(&mut sink).force_no_zip64();
+    for i in 0..u16::MAX {
+        let entry = ZipEntryBuilder::new(format!("{i}").into(), Compression::Stored);
+        let entrywriter = writer.write_entry_stream(entry).await.unwrap();
+        entrywriter.close().await.unwrap();
+    }
+    let entry = ZipEntryBuilder::new("65537".to_string().into(), Compression::Stored);
+    let entrywriter = writer.write_entry_stream(entry).await.unwrap();
+    let result = entrywriter.close().await;
+
+    assert!(matches!(result, Err(ZipError::Zip64Needed(Zip64ErrorCase::TooManyFiles))));
+}
+
+/// Tests that when force_no_zip64 is true, EntryStreamWriter errors when trying to write
+/// a file larger than ~4 GiB to an archive.
+#[tokio::test]
+async fn test_force_no_zip64_errors_with_too_large_file_stream() {
+    let mut sink = AsyncSink;
+    let mut writer = ZipFileWriter::new(&mut sink).force_no_zip64();
+
+    let entry = ZipEntryBuilder::new("-".to_string().into(), Compression::Stored);
+    let mut entrywriter = writer.write_entry_stream(entry).await.unwrap();
+
+    // Write just over 4 GiB, 100 KB at a time
+    for _ in 0..NUM_BATCHES {
+        entrywriter.write_all(&[0; BATCH_SIZE]).await.unwrap();
+    }
+    let result = entrywriter.close().await;
+
+    assert!(matches!(result, Err(ZipError::Zip64Needed(Zip64ErrorCase::LargeFile))));
+}
diff --git a/crates/async_zip/src/tokio/mod.rs b/crates/async_zip/src/tokio/mod.rs
new file mode 100644
index 0000000..fbcc688
--- /dev/null
+++ b/crates/async_zip/src/tokio/mod.rs
@@ -0,0 +1,41 @@
+// Copyright (c) 2023 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+//! A set of [`tokio`]-specific type aliases and features.
+//!
+//! # Usage
+//! With the `tokio` feature enabled, types from the [`base`] implementation will implement additional constructors
+//! for use with [`tokio`]. These constructors internally implement conversion between the required async IO traits.
+//! They are defined as:
+//! - [`base::read::seek::ZipFileReader::with_tokio()`]
+//! - [`base::read::stream::ZipFileReader::with_tokio()`]
+//! - [`base::write::ZipFileWriter::with_tokio()`]
+//!
+//! As a result of Rust's type inference, we are able to reuse the [`base`] implementation's types with considerable
+//! ease. There only exists one caveat with their use; the types returned by these constructors contain a wrapping
+//! compatibility type provided by an external crate. These compatibility types cannot be named unless you also pull in
+//! the [`tokio_util`] dependency manually. This is why we've provided type aliases within this module so that they can
+//! be named without needing to pull in a separate dependency.
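+//!
+//! A minimal sketch of the intended flow (assuming tokio's `File` and
+//! `BufReader`; this example is illustrative and not part of the upstream
+//! documentation):
+//! ```no_run
+//! # async fn run() -> async_zip::error::Result<()> {
+//! let file = tokio::fs::File::open("./foo.zip").await?;
+//! let reader = async_zip::base::read::seek::ZipFileReader::with_tokio(tokio::io::BufReader::new(file)).await?;
+//! # Ok(())
+//! # }
+//! ```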
+
+#[cfg(doc)]
+use crate::base;
+#[cfg(doc)]
+use tokio;
+#[cfg(doc)]
+use tokio_util;
+
+pub mod read;
+
+pub mod write {
+    //! A module which supports writing ZIP files.
+
+    #[cfg(doc)]
+    use crate::base;
+    use tokio_util::compat::Compat;
+
+    /// A [`tokio`]-specific type alias for [`base::write::ZipFileWriter`].
+    pub type ZipFileWriter<W> = crate::base::write::ZipFileWriter<Compat<W>>;
+
+    /// A [`tokio`]-specific type alias for [`base::write::EntryStreamWriter`].
+    pub type EntryStreamWriter<'a, W> = crate::base::write::EntryStreamWriter<'a, Compat<W>>;
+}
diff --git a/crates/async_zip/src/tokio/read/fs.rs b/crates/async_zip/src/tokio/read/fs.rs
new file mode 100644
index 0000000..c045d39
--- /dev/null
+++ b/crates/async_zip/src/tokio/read/fs.rs
@@ -0,0 +1,160 @@
+// Copyright (c) 2022 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+//! A concurrent ZIP reader which acts over a file system path.
+//!
+//! Concurrency is achieved as a result of:
+//! - Wrapping the provided path within an [`Arc`] to allow shared ownership.
+//! - Constructing a new [`File`] from the path when reading.
+//!
+//! ### Usage
+//! Unlike the [`seek`] module, we no longer hold a mutable reference to any inner reader which in turn, allows the
+//! construction of concurrent [`ZipEntryReader`]s. Though, note that each individual [`ZipEntryReader`] cannot be sent
+//! between thread boundaries due to the masked lifetime requirement. Therefore, the overarching [`ZipFileReader`]
+//! should be cloned and moved into those contexts when needed.
+//!
+//! ### Concurrent Example
+//! ```no_run
+//! # use async_zip::tokio::read::fs::ZipFileReader;
+//! # use async_zip::error::Result;
+//! # use futures_lite::io::AsyncReadExt;
+//! #
+//! async fn run() -> Result<()> {
+//!     let reader = ZipFileReader::new("./foo.zip").await?;
+//!     let result = tokio::join!(read(&reader, 0), read(&reader, 1));
+//!
+//!     let data_0 = result.0?;
+//!     let data_1 = result.1?;
+//!
+//!     // Use data within current scope.
+//!
+//!     Ok(())
+//! }
+//!
+//! async fn read(reader: &ZipFileReader, index: usize) -> Result<Vec<u8>> {
+//!     let mut entry = reader.reader_without_entry(index).await?;
+//!     let mut data = Vec::new();
+//!     entry.read_to_end(&mut data).await?;
+//!     Ok(data)
+//! }
+//! ```
+//!
+//! ### Parallel Example
+//! ```no_run
+//! # use async_zip::tokio::read::fs::ZipFileReader;
+//! # use async_zip::error::Result;
+//! # use futures_lite::io::AsyncReadExt;
+//! #
+//! async fn run() -> Result<()> {
+//!     let reader = ZipFileReader::new("./foo.zip").await?;
+//!
+//!     let handle_0 = tokio::spawn(read(reader.clone(), 0));
+//!     let handle_1 = tokio::spawn(read(reader.clone(), 1));
+//!
+//!     let data_0 = handle_0.await.expect("thread panicked")?;
+//!     let data_1 = handle_1.await.expect("thread panicked")?;
+//!
+//!     // Use data within current scope.
+//!
+//!     Ok(())
+//! }
+//!
+//! async fn read(reader: ZipFileReader, index: usize) -> Result<Vec<u8>> {
+//!     let mut entry = reader.reader_without_entry(index).await?;
+//!     let mut data = Vec::new();
+//!     entry.read_to_end(&mut data).await?;
+//!     Ok(data)
+//! }
+//! ```
+
+#[cfg(doc)]
+use crate::base::read::seek;
+
+use crate::base::read::io::entry::{WithEntry, WithoutEntry, ZipEntryReader};
+use crate::error::{Result, ZipError};
+use crate::file::ZipFile;
+
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+
+use tokio::fs::File;
+use tokio::io::BufReader;
+use tokio_util::compat::{Compat, TokioAsyncReadCompatExt};
+
+struct Inner {
+    path: PathBuf,
+    file: ZipFile,
+}
+
+/// A concurrent ZIP reader which acts over a file system path.
+#[derive(Clone)]
+pub struct ZipFileReader {
+    inner: Arc<Inner>,
+}
+
+impl ZipFileReader {
+    /// Constructs a new ZIP reader from a file system path.
+    pub async fn new<P>(path: P) -> Result<ZipFileReader>
+    where
+        P: AsRef<Path>,
+    {
+        let file = crate::base::read::file(File::open(&path).await?.compat()).await?;
+        Ok(ZipFileReader::from_raw_parts(path, file))
+    }
+
+    /// Constructs a ZIP reader from a file system path and ZIP file information derived from that path.
+    ///
+    /// Providing a [`ZipFile`] that wasn't derived from that path may lead to inaccurate parsing.
+    pub fn from_raw_parts<P>(path: P, file: ZipFile) -> ZipFileReader
+    where
+        P: AsRef<Path>,
+    {
+        ZipFileReader { inner: Arc::new(Inner { path: path.as_ref().to_owned(), file }) }
+    }
+
+    /// Returns this ZIP file's information.
+    pub fn file(&self) -> &ZipFile {
+        &self.inner.file
+    }
+
+    /// Returns the file system path provided to the reader during construction.
+    pub fn path(&self) -> &Path {
+        &self.inner.path
+    }
+
+    /// Returns a new entry reader if the provided index is valid.
+    pub async fn reader_without_entry(
+        &self,
+        index: usize,
+    ) -> Result<ZipEntryReader<'static, Compat<BufReader<File>>, WithoutEntry>> {
+        let stored_entry = self.inner.file.entries.get(index).ok_or(ZipError::EntryIndexOutOfBounds)?;
+        let mut fs_file = BufReader::new(File::open(&self.inner.path).await?).compat();
+
+        stored_entry.seek_to_data_offset(&mut fs_file).await?;
+
+        Ok(ZipEntryReader::new_with_owned(
+            fs_file,
+            stored_entry.entry.compression(),
+            stored_entry.entry.compressed_size(),
+        ))
+    }
+
+    /// Returns a new entry reader if the provided index is valid.
+    pub async fn reader_with_entry(
+        &self,
+        index: usize,
+    ) -> Result<ZipEntryReader<'static, Compat<BufReader<File>>, WithEntry<'_>>> {
+        let stored_entry = self.inner.file.entries.get(index).ok_or(ZipError::EntryIndexOutOfBounds)?;
+        let mut fs_file = BufReader::new(File::open(&self.inner.path).await?).compat();
+
+        stored_entry.seek_to_data_offset(&mut fs_file).await?;
+
+        let reader = ZipEntryReader::new_with_owned(
+            fs_file,
+            stored_entry.entry.compression(),
+            stored_entry.entry.compressed_size(),
+        );
+
+        Ok(reader.into_with_entry(stored_entry))
+    }
+}
diff --git a/crates/async_zip/src/tokio/read/mod.rs b/crates/async_zip/src/tokio/read/mod.rs
new file mode 100644
index 0000000..c70ac27
--- /dev/null
+++ b/crates/async_zip/src/tokio/read/mod.rs
@@ -0,0 +1,44 @@
+// Copyright (c) 2023 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+//! A module which supports reading ZIP files.
+
+use tokio_util::compat::Compat;
+
+#[cfg(feature = "tokio-fs")]
+pub mod fs;
+#[cfg(doc)]
+use crate::base;
+#[cfg(doc)]
+use tokio;
+
+/// A [`tokio`]-specific type alias for [`base::read::ZipEntryReader`].
+pub type ZipEntryReader<'a, R, E> = crate::base::read::ZipEntryReader<'a, Compat<R>, E>;
+
+pub mod seek {
+    //! A ZIP reader which acts over a seekable source.
+    use tokio_util::compat::Compat;
+
+    #[cfg(doc)]
+    use crate::base;
+    #[cfg(doc)]
+    use tokio;
+
+    /// A [`tokio`]-specific type alias for [`base::read::seek::ZipFileReader`].
+    pub type ZipFileReader<R> = crate::base::read::seek::ZipFileReader<Compat<R>>;
+}
+
+pub mod stream {
+    //! A ZIP reader which acts over a non-seekable source.
+
+    #[cfg(doc)]
+    use crate::base;
+    #[cfg(doc)]
+    use tokio;
+    use tokio_util::compat::Compat;
+
+    /// A [`tokio`]-specific type alias for [`base::read::stream::Reading`].
+    pub type Reading<'a, R, E> = crate::base::read::stream::Reading<'a, Compat<R>, E>;
+    /// A [`tokio`]-specific type alias for [`base::read::stream::Ready`].
+    pub type Ready<R> = crate::base::read::stream::Ready<Compat<R>>;
+}
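+
+// Illustrative sketch (hypothetical): the aliases make the wrapped compat
+// types nameable without depending on tokio_util directly, e.g. in a field:
+//
+// ```
+// struct Archive {
+//     reader: crate::tokio::read::seek::ZipFileReader<tokio::io::BufReader<tokio::fs::File>>,
+// }
+// ```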
diff --git a/crates/async_zip/src/utils.rs b/crates/async_zip/src/utils.rs
new file mode 100644
index 0000000..269d334
--- /dev/null
+++ b/crates/async_zip/src/utils.rs
@@ -0,0 +1,18 @@
+// Copyright (c) 2023 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+use crate::error::{Result, ZipError};
+use futures_lite::io::{AsyncRead, AsyncReadExt};
+
+// Assert that the next four-byte signature read by a reader which impls AsyncRead matches the expected signature.
+pub(crate) async fn assert_signature<R: AsyncRead + Unpin>(reader: &mut R, expected: u32) -> Result<()> {
+    let signature = {
+        let mut buffer = [0; 4];
+        reader.read_exact(&mut buffer).await?;
+        u32::from_le_bytes(buffer)
+    };
+    match signature {
+        actual if actual == expected => Ok(()),
+        actual => Err(ZipError::UnexpectedHeaderError(actual, expected)),
+    }
+}
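+
+// Illustrative sketch (hypothetical): signatures are stored little-endian on
+// disk, so these four bytes satisfy an assertion against 0x06054b50.
+//
+// ```
+// let mut cursor = futures_lite::io::Cursor::new([0x50, 0x4b, 0x05, 0x06]);
+// assert_signature(&mut cursor, 0x06054b50).await?;
+// ```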
diff --git a/crates/async_zip/tests/common/mod.rs b/crates/async_zip/tests/common/mod.rs
new file mode 100644
index 0000000..d72f02e
--- /dev/null
+++ b/crates/async_zip/tests/common/mod.rs
@@ -0,0 +1,99 @@
+// Copyright (c) 2023 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+#![allow(dead_code)]
+
+use async_zip::base::read::mem;
+use async_zip::base::read::seek;
+use async_zip::base::write::ZipFileWriter;
+use async_zip::Compression;
+use async_zip::ZipEntryBuilder;
+use futures_lite::io::AsyncWriteExt;
+use tokio::fs::File;
+use tokio::io::BufReader;
+use tokio_util::compat::TokioAsyncReadCompatExt;
+
+const FOLDER_PREFIX: &str = "tests/test_inputs";
+
+const FILE_LIST: &[&str] = &[
+    "sample_data/alpha/back_to_front.txt",
+    "sample_data/alpha/front_to_back.txt",
+    "sample_data/numeric/forward.txt",
+    "sample_data/numeric/reverse.txt",
+];
+
+pub async fn compress_to_mem(compress: Compression) -> Vec<u8> {
+    let mut bytes = Vec::with_capacity(10_000);
+    let mut writer = ZipFileWriter::new(&mut bytes);
+
+    for fname in FILE_LIST {
+        let content = tokio::fs::read(format!("{FOLDER_PREFIX}/{fname}")).await.unwrap();
+        let opts = ZipEntryBuilder::new(fname.to_string().into(), compress);
+
+        let mut entry_writer = writer.write_entry_stream(opts).await.unwrap();
+        entry_writer.write_all(&content).await.unwrap();
+        entry_writer.close().await.unwrap();
+    }
+    writer.close().await.unwrap();
+    bytes
+}
+
+#[cfg(feature = "tokio-fs")]
+pub async fn check_decompress_fs(fname: &str) {
+    use async_zip::tokio::read::fs;
+    let zip = fs::ZipFileReader::new(fname).await.unwrap();
+    let zip_entries: Vec<_> = zip.file().entries().to_vec();
+    for (idx, entry) in zip_entries.into_iter().enumerate() {
+        // TODO: resolve unwrap usage
+        if entry.dir().unwrap() {
+            continue;
+        }
+        // TODO: resolve unwrap usage
+        let fname = entry.filename().as_str().unwrap();
+        let mut output = String::new();
+        let mut reader = zip.reader_with_entry(idx).await.unwrap();
+        let _ = reader.read_to_string_checked(&mut output).await.unwrap();
+        let fs_file = format!("{FOLDER_PREFIX}/{fname}");
+        let expected = tokio::fs::read_to_string(fs_file).await.unwrap();
+        assert_eq!(output, expected, "for {fname}, expect zip data to match file data");
+    }
+}
+
+pub async fn check_decompress_seek(fname: &str) {
+    let file = BufReader::new(File::open(fname).await.unwrap());
+    let mut file_compat = file.compat();
+    let mut zip = seek::ZipFileReader::new(&mut file_compat).await.unwrap();
+    let zip_entries: Vec<_> = zip.file().entries().to_vec();
+    for (idx, entry) in zip_entries.into_iter().enumerate() {
+        // TODO: resolve unwrap usage
+        if entry.dir().unwrap() {
+            continue;
+        }
+        // TODO: resolve unwrap usage
+        let fname = entry.filename().as_str().unwrap();
+        let mut output = String::new();
+        let mut reader = zip.reader_with_entry(idx).await.unwrap();
+        let _ = reader.read_to_string_checked(&mut output).await.unwrap();
+        let fs_file = format!("{FOLDER_PREFIX}/{fname}");
+        let expected = tokio::fs::read_to_string(fs_file).await.unwrap();
+        assert_eq!(output, expected, "for {fname}, expect zip data to match file data");
+    }
+}
+
+pub async fn check_decompress_mem(zip_data: Vec<u8>) {
+    let zip = mem::ZipFileReader::new(zip_data).await.unwrap();
+    let zip_entries: Vec<_> = zip.file().entries().to_vec();
+    for (idx, entry) in zip_entries.into_iter().enumerate() {
+        // TODO: resolve unwrap usage
+        if entry.dir().unwrap() {
+            continue;
+        }
+        // TODO: resolve unwrap usage
+        let fname = entry.filename().as_str().unwrap();
+        let mut output = String::new();
+        let mut reader = zip.reader_with_entry(idx).await.unwrap();
+        let _ = reader.read_to_string_checked(&mut output).await.unwrap();
+        let fs_file = format!("{FOLDER_PREFIX}/{fname}");
+        let expected = tokio::fs::read_to_string(fs_file).await.unwrap();
+        assert_eq!(output, expected, "for {fname}, expect zip data to match file data");
+    }
+}
diff --git a/crates/async_zip/tests/compress_test.rs b/crates/async_zip/tests/compress_test.rs
new file mode 100644
index 0000000..9efa7e8
--- /dev/null
+++ b/crates/async_zip/tests/compress_test.rs
@@ -0,0 +1,81 @@
+// Copyright (c) 2023 Harry [Majored] [hello@majored.pw]
+// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE)
+
+use async_zip::{Compression, ZipEntryBuilder, ZipString};
+use futures_lite::AsyncWriteExt;
+
+mod common;
+
+#[cfg(feature = "zstd")]
+#[tokio::test]
+async fn zip_zstd_in_out() {
+    let zip_data = common::compress_to_mem(Compression::Zstd).await;
+    common::check_decompress_mem(zip_data).await
+}
+
+#[cfg(feature = "deflate")]
+#[tokio::test]
+async fn zip_decompress_in_out() {
+    let zip_data = common::compress_to_mem(Compression::Deflate).await;
+    common::check_decompress_mem(zip_data).await
+}
+
+#[tokio::test]
+async fn zip_store_in_out() {
+    let zip_data = common::compress_to_mem(Compression::Stored).await;
+    common::check_decompress_mem(zip_data).await
+}
+
+#[tokio::test]
+async fn zip_utf8_extra_in_out_stream() {
+    let mut zip_bytes = Vec::with_capacity(10_000);
+
+    {
+        // writing
+        let content = "Test".as_bytes();
+        let mut writer = async_zip::base::write::ZipFileWriter::new(&mut zip_bytes);
+        let filename =
+            ZipString::new_with_alternative("\u{4E2D}\u{6587}.txt".to_string(), b"\xD6\xD0\xCe\xC4.txt".to_vec());
+        let opts = ZipEntryBuilder::new(filename, Compression::Stored);
+
+        let mut entry_writer = writer.write_entry_stream(opts).await.unwrap();
+        entry_writer.write_all(content).await.unwrap();
+        entry_writer.close().await.unwrap();
+
+        writer.close().await.unwrap();
+    }
+
+    {
+        // reading
+        let zip = async_zip::base::read::mem::ZipFileReader::new(zip_bytes).await.unwrap();
+        let zip_entries: Vec<_> = zip.file().entries().to_vec();
+        assert_eq!(zip_entries.len(), 1);
+        assert_eq!(zip_entries[0].filename().as_str().unwrap(), "\u{4E2D}\u{6587}.txt");
+        assert_eq!(zip_entries[0].filename().alternative(), Some(b"\xD6\xD0\xCe\xC4.txt".as_ref()));
+    }
+}
+
+#[tokio::test]
+async fn zip_utf8_extra_in_out_whole() {
+    let mut zip_bytes = Vec::with_capacity(10_000);
+
+    {
+        // writing
+        let content = "Test".as_bytes();
+        let mut writer = async_zip::base::write::ZipFileWriter::new(&mut zip_bytes);
+        let filename =
+            ZipString::new_with_alternative("\u{4E2D}\u{6587}.txt".to_string(), b"\xD6\xD0\xCe\xC4.txt".to_vec());
+        let opts = ZipEntryBuilder::new(filename, Compression::Stored);
+        writer.write_entry_whole(opts, content).await.unwrap();
+        writer.close().await.unwrap();
+    }
+
+    {
+        // reading
+        let zip = async_zip::base::read::mem::ZipFileReader::new(zip_bytes).await.unwrap();
+        let zip_entries: Vec<_> = zip.file().entries().to_vec();
+        assert_eq!(zip_entries.len(), 1);
+
assert_eq!(zip_entries[0].filename().as_str().unwrap(), "\u{4E2D}\u{6587}.txt"); + assert_eq!(zip_entries[0].filename().alternative(), Some(b"\xD6\xD0\xCe\xC4.txt".as_ref())); + } +} diff --git a/crates/async_zip/tests/decompress_test.rs b/crates/async_zip/tests/decompress_test.rs new file mode 100644 index 0000000..8fa7acf --- /dev/null +++ b/crates/async_zip/tests/decompress_test.rs @@ -0,0 +1,89 @@ +// Copyright (c) 2023 Harry [Majored] [hello@majored.pw] +// MIT License (https://github.com/Majored/rs-async-zip/blob/main/LICENSE) +#![allow(dead_code)] + +use tokio::io::BufReader; +use tokio_util::compat::TokioAsyncReadCompatExt; + +mod common; + +const ZSTD_ZIP_FILE: &str = "tests/test_inputs/sample_data.zstd.zip"; +const DEFLATE_ZIP_FILE: &str = "tests/test_inputs/sample_data.deflate.zip"; +const STORE_ZIP_FILE: &str = "tests/test_inputs/sample_data.store.zip"; +const UTF8_EXTRA_ZIP_FILE: &str = "tests/test_inputs/sample_data_utf8_extra.zip"; + +#[cfg(feature = "zstd")] +#[tokio::test] +async fn decompress_zstd_zip_seek() { + common::check_decompress_seek(ZSTD_ZIP_FILE).await +} + +#[cfg(feature = "deflate")] +#[tokio::test] +async fn decompress_deflate_zip_seek() { + common::check_decompress_seek(DEFLATE_ZIP_FILE).await +} + +#[tokio::test] +async fn check_empty_zip_seek() { + let mut data: Vec = Vec::new(); + async_zip::base::write::ZipFileWriter::new(futures_lite::io::Cursor::new(&mut data)).close().await.unwrap(); + async_zip::base::read::seek::ZipFileReader::new(futures_lite::io::Cursor::new(&data)).await.unwrap(); +} + +#[tokio::test] +async fn decompress_store_zip_seek() { + common::check_decompress_seek(STORE_ZIP_FILE).await +} + +#[cfg(feature = "zstd")] +#[tokio::test] +async fn decompress_zstd_zip_mem() { + let content = tokio::fs::read(ZSTD_ZIP_FILE).await.unwrap(); + common::check_decompress_mem(content).await +} + +#[cfg(feature = "deflate")] +#[tokio::test] +async fn decompress_deflate_zip_mem() { + let content = tokio::fs::read(DEFLATE_ZIP_FILE).await.unwrap(); + common::check_decompress_mem(content).await +} + +#[tokio::test] +async fn decompress_store_zip_mem() { + let content = tokio::fs::read(STORE_ZIP_FILE).await.unwrap(); + common::check_decompress_mem(content).await +} + +#[cfg(feature = "zstd")] +#[cfg(feature = "tokio-fs")] +#[tokio::test] +async fn decompress_zstd_zip_fs() { + common::check_decompress_fs(ZSTD_ZIP_FILE).await +} + +#[cfg(feature = "deflate")] +#[cfg(feature = "tokio-fs")] +#[tokio::test] +async fn decompress_deflate_zip_fs() { + common::check_decompress_fs(DEFLATE_ZIP_FILE).await +} + +#[cfg(feature = "tokio-fs")] +#[tokio::test] +async fn decompress_store_zip_fs() { + common::check_decompress_fs(STORE_ZIP_FILE).await +} + +#[tokio::test] +async fn decompress_zip_with_utf8_extra() { + let file = BufReader::new(tokio::fs::File::open(UTF8_EXTRA_ZIP_FILE).await.unwrap()); + let mut file_compat = file.compat(); + let zip = async_zip::base::read::seek::ZipFileReader::new(&mut file_compat).await.unwrap(); + let zip_entries: Vec<_> = zip.file().entries().to_vec(); + assert_eq!(zip_entries.len(), 1); + assert_eq!(zip_entries[0].header_size(), 93); + assert_eq!(zip_entries[0].filename().as_str().unwrap(), "\u{4E2D}\u{6587}.txt"); + assert_eq!(zip_entries[0].filename().alternative(), Some(b"\xD6\xD0\xCe\xC4.txt".as_ref())); +} diff --git a/crates/async_zip/tests/test_inputs/sample_data.deflate.zip b/crates/async_zip/tests/test_inputs/sample_data.deflate.zip new file mode 100644 index 0000000..f7cb773 Binary files /dev/null and 
b/crates/async_zip/tests/test_inputs/sample_data.deflate.zip differ diff --git a/crates/async_zip/tests/test_inputs/sample_data.store.zip b/crates/async_zip/tests/test_inputs/sample_data.store.zip new file mode 100644 index 0000000..1f4d53c Binary files /dev/null and b/crates/async_zip/tests/test_inputs/sample_data.store.zip differ diff --git a/crates/async_zip/tests/test_inputs/sample_data.zstd.zip b/crates/async_zip/tests/test_inputs/sample_data.zstd.zip new file mode 100644 index 0000000..614293c Binary files /dev/null and b/crates/async_zip/tests/test_inputs/sample_data.zstd.zip differ diff --git a/crates/async_zip/tests/test_inputs/sample_data/alpha/back_to_front.txt b/crates/async_zip/tests/test_inputs/sample_data/alpha/back_to_front.txt new file mode 100644 index 0000000..5f84448 --- /dev/null +++ b/crates/async_zip/tests/test_inputs/sample_data/alpha/back_to_front.txt @@ -0,0 +1,4 @@ +Z,z,Y,y,X,x,W,w,V,v,U,u,T,t,S,s,R,r,Q,q,P,p,O,o,N,n,M,m,L,l,K,k,J,j,I,I,H,h,G,g,F,f,E,e,D,d,C,c,B,b,A,a +Z,z,Y,y,X,x,W,w,V,v,U,u,T,t,S,s,R,r,Q,q,P,p,O,o,N,n,M,m,L,l,K,k,J,j,I,I,H,h,G,g,F,f,E,e,D,d,C,c,B,b,A,a +Z,z,Y,y,X,x,W,w,V,v,U,u,T,t,S,s,R,r,Q,q,P,p,O,o,N,n,M,m,L,l,K,k,J,j,I,I,H,h,G,g,F,f,E,e,D,d,C,c,B,b,A,a +Z,z,Y,y,X,x,W,w,V,v,U,u,T,t,S,s,R,r,Q,q,P,p,O,o,N,n,M,m,L,l,K,k,J,j,I,I,H,h,G,g,F,f,E,e,D,d,C,c,B,b,A,a diff --git a/crates/async_zip/tests/test_inputs/sample_data/alpha/front_to_back.txt b/crates/async_zip/tests/test_inputs/sample_data/alpha/front_to_back.txt new file mode 100644 index 0000000..3d01a65 --- /dev/null +++ b/crates/async_zip/tests/test_inputs/sample_data/alpha/front_to_back.txt @@ -0,0 +1,4 @@ +A,a,B,b,C,c,D,d,E,e,F,f,G,g,H,h,I,I,J,j,K,k,L,l,M,m,N,n,O,o,P,p,Q,q,R,r,S,s,T,t,U,u,V,v,W,w,X,x,Y,y,Z,z +A,a,B,b,C,c,D,d,E,e,F,f,G,g,H,h,I,I,J,j,K,k,L,l,M,m,N,n,O,o,P,p,Q,q,R,r,S,s,T,t,U,u,V,v,W,w,X,x,Y,y,Z,z +A,a,B,b,C,c,D,d,E,e,F,f,G,g,H,h,I,I,J,j,K,k,L,l,M,m,N,n,O,o,P,p,Q,q,R,r,S,s,T,t,U,u,V,v,W,w,X,x,Y,y,Z,z +A,a,B,b,C,c,D,d,E,e,F,f,G,g,H,h,I,I,J,j,K,k,L,l,M,m,N,n,O,o,P,p,Q,q,R,r,S,s,T,t,U,u,V,v,W,w,X,x,Y,y,Z,z diff --git a/crates/async_zip/tests/test_inputs/sample_data/numeric/forward.txt b/crates/async_zip/tests/test_inputs/sample_data/numeric/forward.txt new file mode 100644 index 0000000..836aa3d --- /dev/null +++ b/crates/async_zip/tests/test_inputs/sample_data/numeric/forward.txt @@ -0,0 +1 @@ +1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32 diff --git a/crates/async_zip/tests/test_inputs/sample_data/numeric/reverse.txt b/crates/async_zip/tests/test_inputs/sample_data/numeric/reverse.txt new file mode 100644 index 0000000..2110808 --- /dev/null +++ b/crates/async_zip/tests/test_inputs/sample_data/numeric/reverse.txt @@ -0,0 +1 @@ +32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1 diff --git a/crates/async_zip/tests/test_inputs/sample_data_utf8_extra.zip b/crates/async_zip/tests/test_inputs/sample_data_utf8_extra.zip new file mode 100644 index 0000000..cec7282 Binary files /dev/null and b/crates/async_zip/tests/test_inputs/sample_data_utf8_extra.zip differ diff --git a/crates/envy/Cargo.toml b/crates/envy/Cargo.toml new file mode 100644 index 0000000..531d999 --- /dev/null +++ b/crates/envy/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "envy" +version = "0.4.2" +authors = ["softprops "] +description = "deserialize env vars into typesafe structs" +documentation = "https://softprops.github.io/envy" +homepage = "https://github.com/softprops/envy" +repository = "https://github.com/softprops/envy" +keywords 
= ["serde", "env"] +license = "MIT" +readme = "README.md" +edition = "2021" +categories = [ + "config" +] + +[dependencies] +serde = "1.0" + +[dev-dependencies] +serde = { version = "1.0", features = ["derive"] } diff --git a/crates/envy/src/error.rs b/crates/envy/src/error.rs new file mode 100644 index 0000000..3d0fec1 --- /dev/null +++ b/crates/envy/src/error.rs @@ -0,0 +1,55 @@ +//! Error types +use serde::de::Error as SerdeError; +use std::{error::Error as StdError, fmt}; + +/// Types of errors that may result from failed attempts +/// to deserialize a type from env vars +#[derive(Debug, Clone, PartialEq)] +pub enum Error { + MissingValue(String), + Custom(String), +} + +impl StdError for Error {} + +impl fmt::Display for Error { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + Error::MissingValue(field) => write!(fmt, "missing value for {}", &field), + Error::Custom(ref msg) => write!(fmt, "{}", msg), + } + } +} + +impl SerdeError for Error { + fn custom(msg: T) -> Self { + Error::Custom(format!("{}", msg)) + } + + fn missing_field(field: &'static str) -> Error { + Error::MissingValue(field.into()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn impl_std_error(_: E) {} + + #[test] + fn error_impl_std_error() { + impl_std_error(Error::MissingValue("FOO_BAR".into())); + impl_std_error(Error::Custom("whoops".into())) + } + + #[test] + fn error_display() { + assert_eq!( + format!("{}", Error::MissingValue("FOO_BAR".into())), + "missing value for FOO_BAR" + ); + + assert_eq!(format!("{}", Error::Custom("whoops".into())), "whoops") + } +} diff --git a/crates/envy/src/lib.rs b/crates/envy/src/lib.rs new file mode 100644 index 0000000..75835d9 --- /dev/null +++ b/crates/envy/src/lib.rs @@ -0,0 +1,560 @@ +//! Envy is a library for deserializing environment variables into typesafe structs +//! +//! # Examples +//! +//! A typical usecase for envy is deserializing configuration store in an process' environment into a struct +//! whose fields map to the names of env vars. +//! +//! Serde makes it easy to provide a deserializable struct with its [deriveable Deserialize](https://serde.rs/derive.html) +//! procedural macro. +//! +//! Simply ask for an instance of that struct from envy's `from_env` function. +//! +//! ```no_run +//! use serde::Deserialize; +//! +//! #[derive(Deserialize, Debug)] +//! struct Config { +//! foo: u16, +//! bar: bool, +//! baz: String, +//! boom: Option, +//! } +//! +//! match envy::from_env::() { +//! Ok(config) => println!("{:#?}", config), +//! Err(error) => eprintln!("{:#?}", error), +//! } +//! ``` +//! +//! Special treatment is given to collections. For config fields that store a `Vec` of values, +//! use an env var that uses a comma separated value. +//! +//! All serde modifiers should work as is. +//! +//! Enums with unit variants can be used as values: +//! +//! ```no_run +//! # use serde::Deserialize; +//! +//! #[derive(Deserialize, Debug, PartialEq)] +//! #[serde(rename_all = "lowercase")] +//! pub enum Size { +//! Small, +//! Medium, +//! Large, +//! } +//! +//! #[derive(Deserialize, Debug)] +//! struct Config { +//! size: Size, +//! } +//! +//! // set env var for size as `SIZE=medium` +//! match envy::from_env::() { +//! Ok(config) => println!("{:#?}", config), +//! Err(error) => eprintln!("{:#?}", error), +//! } +//! 
+
+use serde::de::{
+    self,
+    value::{MapDeserializer, SeqDeserializer},
+    IntoDeserializer,
+};
+use std::{
+    borrow::Cow,
+    env,
+    iter::{empty, IntoIterator},
+};
+
+// Ours
+mod error;
+pub use crate::error::Error;
+
+/// A result type specific to `envy::Error`
+pub type Result<T> = std::result::Result<T, Error>;
+
+struct Vars<Iter>(Iter)
+where
+    Iter: IntoIterator<Item = (String, String)>;
+
+struct Val(String, String);
+
+impl<'de> IntoDeserializer<'de, Error> for Val {
+    type Deserializer = Self;
+
+    fn into_deserializer(self) -> Self::Deserializer {
+        self
+    }
+}
+
+struct VarName(String);
+
+impl<'de> IntoDeserializer<'de, Error> for VarName {
+    type Deserializer = Self;
+
+    fn into_deserializer(self) -> Self::Deserializer {
+        self
+    }
+}
+
+impl<Iter: Iterator<Item = (String, String)>> Iterator for Vars<Iter> {
+    type Item = (VarName, Val);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.0
+            .next()
+            .map(|(k, v)| (VarName(k.to_lowercase()), Val(k, v)))
+    }
+}
+
+macro_rules! forward_parsed_values {
+    ($($ty:ident => $method:ident,)*) => {
+        $(
+            fn $method<V>(self, visitor: V) -> Result<V::Value>
+                where V: de::Visitor<'de>
+            {
+                match self.1.parse::<$ty>() {
+                    Ok(val) => val.into_deserializer().$method(visitor),
+                    Err(e) => Err(de::Error::custom(format_args!("{} while parsing value '{}' provided by {}", e, self.1, self.0)))
+                }
+            }
+        )*
+    }
+}
+
+impl<'de> de::Deserializer<'de> for Val {
+    type Error = Error;
+    fn deserialize_any<V>(self, visitor: V) -> Result<V::Value>
+    where
+        V: de::Visitor<'de>,
+    {
+        self.1.into_deserializer().deserialize_any(visitor)
+    }
+
+    fn deserialize_seq<V>(self, visitor: V) -> Result<V::Value>
+    where
+        V: de::Visitor<'de>,
+    {
+        // std::str::split doesn't work as expected for our use case: when we
+        // get an empty string we want to produce an empty Vec, but split would
+        // still yield an iterator with an empty string in it. So we need to
+        // special case empty strings.
+        if self.1.is_empty() {
+            SeqDeserializer::new(empty::<Val>()).deserialize_seq(visitor)
+        } else {
+            let values = self
+                .1
+                .split(',')
+                .map(|v| Val(self.0.clone(), v.trim().to_owned()));
+            SeqDeserializer::new(values).deserialize_seq(visitor)
+        }
+    }
+
+    fn deserialize_option<V>(self, visitor: V) -> Result<V::Value>
+    where
+        V: de::Visitor<'de>,
+    {
+        if self.1.is_empty() {
+            visitor.visit_none()
+        } else {
+            visitor.visit_some(self)
+        }
+    }
+
+    forward_parsed_values! {
+        u8 => deserialize_u8,
+        u16 => deserialize_u16,
+        u32 => deserialize_u32,
+        u64 => deserialize_u64,
+        u128 => deserialize_u128,
+        i8 => deserialize_i8,
+        i16 => deserialize_i16,
+        i32 => deserialize_i32,
+        i64 => deserialize_i64,
+        i128 => deserialize_i128,
+        f32 => deserialize_f32,
+        f64 => deserialize_f64,
+    }
+
+    fn deserialize_bool<V>(self, visitor: V) -> Result<V::Value>
+    where
+        V: de::Visitor<'de>,
+    {
+        if self.1 == "1" || self.1.eq_ignore_ascii_case("true") {
+            visitor.visit_bool(true)
+        } else if self.1 == "0" || self.1.eq_ignore_ascii_case("false") {
+            visitor.visit_bool(false)
+        } else {
+            Err(de::Error::custom(format_args!(
+                "error parsing boolean value: '{}'",
+                self.1
+            )))
+        }
+    }
+
+    #[inline]
+    fn deserialize_newtype_struct<V>(self, _: &'static str, visitor: V) -> Result<V::Value>
+    where
+        V: serde::de::Visitor<'de>,
+    {
+        visitor.visit_newtype_struct(self)
+    }
+
+    fn deserialize_enum<V>(
+        self,
+        _name: &'static str,
+        _variants: &'static [&'static str],
+        visitor: V,
+    ) -> Result<V::Value>
+    where
+        V: de::Visitor<'de>,
+    {
+        visitor.visit_enum(self.1.into_deserializer())
+    }
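The empty-string special case in `deserialize_seq` above is the subtle part of this impl. A quick check of the intended behavior, runnable against this vendored crate (field names invented):

```rust
use serde::Deserialize;

#[derive(Deserialize, Debug, PartialEq)]
struct Lists {
    // Values are split on ',' and trimmed; an empty string must
    // produce an empty Vec rather than vec![""].
    tags: Vec<String>,
    ports: Vec<u16>,
}

fn main() {
    let vars = vec![
        ("TAGS".to_owned(), "".to_owned()),
        ("PORTS".to_owned(), "80, 443".to_owned()),
    ];
    let lists = envy::from_iter::<_, Lists>(vars).unwrap();
    assert_eq!(lists.tags, Vec::<String>::new());
    assert_eq!(lists.ports, vec![80, 443]);
}
```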
+
+    serde::forward_to_deserialize_any! {
+        char str string unit
+        bytes byte_buf map unit_struct tuple_struct
+        identifier tuple ignored_any
+        struct
+    }
+}
+
+impl<'de> de::Deserializer<'de> for VarName {
+    type Error = Error;
+    fn deserialize_any<V>(self, visitor: V) -> Result<V::Value>
+    where
+        V: de::Visitor<'de>,
+    {
+        self.0.into_deserializer().deserialize_any(visitor)
+    }
+
+    #[inline]
+    fn deserialize_newtype_struct<V>(self, _: &'static str, visitor: V) -> Result<V::Value>
+    where
+        V: serde::de::Visitor<'de>,
+    {
+        visitor.visit_newtype_struct(self)
+    }
+
+    serde::forward_to_deserialize_any! {
+        char str string unit seq option
+        bytes byte_buf map unit_struct tuple_struct
+        identifier tuple ignored_any enum
+        struct bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64
+    }
+}
+
+/// A deserializer for env vars
+struct Deserializer<'de, Iter: Iterator<Item = (String, String)>> {
+    inner: MapDeserializer<'de, Vars<Iter>, Error>,
+}
+
+impl<'de, Iter: Iterator<Item = (String, String)>> Deserializer<'de, Iter> {
+    fn new(vars: Iter) -> Self {
+        Deserializer {
+            inner: MapDeserializer::new(Vars(vars)),
+        }
+    }
+}
+
+impl<'de, Iter: Iterator<Item = (String, String)>> de::Deserializer<'de>
+    for Deserializer<'de, Iter>
+{
+    type Error = Error;
+    fn deserialize_any<V>(self, visitor: V) -> Result<V::Value>
+    where
+        V: de::Visitor<'de>,
+    {
+        self.deserialize_map(visitor)
+    }
+
+    fn deserialize_map<V>(self, visitor: V) -> Result<V::Value>
+    where
+        V: de::Visitor<'de>,
+    {
+        visitor.visit_map(self.inner)
+    }
+
+    serde::forward_to_deserialize_any! {
+        bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit seq
+        bytes byte_buf unit_struct tuple_struct
+        identifier tuple ignored_any option newtype_struct enum
+        struct
+    }
+}
+
+/// Deserializes a type based on information stored in env variables
+pub fn from_env<T>() -> Result<T>
+where
+    T: de::DeserializeOwned,
+{
+    from_iter(env::vars())
+}
+
+/// Deserializes a type based on an iterable of `(String, String)`
+/// representing keys and values
+pub fn from_iter<Iter, T>(iter: Iter) -> Result<T>
+where
+    T: de::DeserializeOwned,
+    Iter: IntoIterator<Item = (String, String)>,
+{
+    T::deserialize(Deserializer::new(iter.into_iter())).map_err(|error| match error {
+        Error::MissingValue(value) => Error::MissingValue(value.to_uppercase()),
+        _ => error,
+    })
+}
+
+/// A type which filters env vars with a prefix for use as serde field inputs
+///
+/// These types are created with the [prefixed](fn.prefixed.html) module function
+pub struct Prefixed<'a>(Cow<'a, str>);
+
+impl<'a> Prefixed<'a> {
+    /// Deserializes a type based on prefixed env variables
+    pub fn from_env<T>(&self) -> Result<T>
+    where
+        T: de::DeserializeOwned,
+    {
+        self.from_iter(env::vars())
+    }
+
+    /// Deserializes a type based on prefixed (String, String) tuples
+    pub fn from_iter<Iter, T>(&self, iter: Iter) -> Result<T>
+    where
+        T: de::DeserializeOwned,
+        Iter: IntoIterator<Item = (String, String)>,
+    {
+        crate::from_iter(iter.into_iter().filter_map(|(k, v)| {
+            if k.starts_with(self.0.as_ref()) {
+                Some((k.trim_start_matches(self.0.as_ref()).to_owned(), v))
+            } else {
+                None
+            }
+        }))
+        .map_err(|error| match error {
+            Error::MissingValue(value) => Error::MissingValue(
+                format!("{prefix}{value}", prefix = self.0, value = value).to_uppercase(),
+            ),
+            _ => error,
+        })
+    }
+}
+
+/// Produces an instance of `Prefixed` for prefixing env variable names
+///
+/// # Example
+///
+/// ```no_run
+/// use serde::Deserialize;
+///
+/// #[derive(Deserialize, Debug)]
+/// struct Config {
+///     foo: u16,
+///     bar: bool,
+///     baz: String,
+///     boom: Option<u64>,
+/// }
+///
+/// // all env variables will be expected to be prefixed with APP_,
+/// // i.e. APP_FOO, APP_BAR, etc.
+/// match envy::prefixed("APP_").from_env::<Config>() {
+///     Ok(config) => println!("{:#?}", config),
+///     Err(error) => eprintln!("{:#?}", error),
+/// }
+/// ```
+pub fn prefixed<'a, C>(prefix: C) -> Prefixed<'a>
+where
+    C: Into<Cow<'a, str>>,
+{
+    Prefixed(prefix.into())
+}
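Since `Prefixed::from_iter` just filters and strips matching keys before delegating to `crate::from_iter`, the prefix logic can be exercised without real env vars. A sketch with invented `APP_`-prefixed values:

```rust
use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct DbConfig {
    host: String,
    port: u16,
}

fn main() {
    let vars = vec![
        ("APP_HOST".to_owned(), "db.internal".to_owned()),
        ("APP_PORT".to_owned(), "5432".to_owned()),
        // Not prefixed, therefore ignored by the prefixed deserializer.
        ("HOST".to_owned(), "ignored.example".to_owned()),
    ];
    let db = envy::prefixed("APP_").from_iter::<_, DbConfig>(vars).unwrap();
    assert_eq!(db.host, "db.internal");
    assert_eq!(db.port, 5432);
}
```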
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde::Deserialize;
+    use std::collections::HashMap;
+
+    #[derive(Default, Deserialize, Debug, PartialEq)]
+    #[serde(rename_all = "lowercase")]
+    pub enum Size {
+        Small,
+        #[default]
+        Medium,
+        Large,
+    }
+
+    pub fn default_kaboom() -> u16 {
+        8080
+    }
+
+    #[derive(Deserialize, Debug, PartialEq)]
+    pub struct CustomNewType(u32);
+
+    #[derive(Deserialize, Debug, PartialEq)]
+    pub struct Foo {
+        bar: String,
+        baz: bool,
+        zoom: Option<u16>,
+        doom: Vec<u64>,
+        boom: Vec<String>,
+        #[serde(default = "default_kaboom")]
+        kaboom: u16,
+        #[serde(default)]
+        debug_mode: bool,
+        #[serde(default)]
+        size: Size,
+        provided: Option<String>,
+        newtype: CustomNewType,
+    }
+
+    #[test]
+    fn deserialize_from_iter() {
+        let data = vec![
+            (String::from("BAR"), String::from("test")),
+            (String::from("BAZ"), String::from("true")),
+            (String::from("DOOM"), String::from("1, 2, 3 ")),
+            // Empty string should result in empty vector.
+            (String::from("BOOM"), String::from("")),
+            (String::from("SIZE"), String::from("small")),
+            (String::from("PROVIDED"), String::from("test")),
+            (String::from("NEWTYPE"), String::from("42")),
+        ];
+        match from_iter::<_, Foo>(data) {
+            Ok(actual) => assert_eq!(
+                actual,
+                Foo {
+                    bar: String::from("test"),
+                    baz: true,
+                    zoom: None,
+                    doom: vec![1, 2, 3],
+                    boom: vec![],
+                    kaboom: 8080,
+                    debug_mode: false,
+                    size: Size::Small,
+                    provided: Some(String::from("test")),
+                    newtype: CustomNewType(42)
+                }
+            ),
+            Err(e) => panic!("{:#?}", e),
+        }
+    }
+
+    #[test]
+    fn fails_with_missing_value() {
+        let data = vec![
+            (String::from("BAR"), String::from("test")),
+            (String::from("BAZ"), String::from("true")),
+        ];
+        match from_iter::<_, Foo>(data) {
+            Ok(_) => panic!("expected failure"),
+            Err(e) => assert_eq!(e, Error::MissingValue("DOOM".into())),
+        }
+    }
+
+    #[test]
+    fn prefixed_fails_with_missing_value() {
+        let data = vec![
+            (String::from("PREFIX_BAR"), String::from("test")),
+            (String::from("PREFIX_BAZ"), String::from("true")),
+        ];
+
+        match prefixed("PREFIX_").from_iter::<_, Foo>(data) {
+            Ok(_) => panic!("expected failure"),
+            Err(e) => assert_eq!(e, Error::MissingValue("PREFIX_DOOM".into())),
+        }
+    }
+
+    #[test]
+    fn fails_with_invalid_type() {
+        let data = vec![
+            (String::from("BAR"), String::from("test")),
+            (String::from("BAZ"), String::from("notabool")),
+            (String::from("DOOM"), String::from("1,2,3")),
+        ];
+        match from_iter::<_, Foo>(data) {
+            Ok(_) => panic!("expected failure"),
+            // deserialize_bool reports invalid values itself, so the expected
+            // message matches its custom error rather than bool's FromStr.
+            Err(e) => assert_eq!(
+                e,
+                Error::Custom(String::from("error parsing boolean value: 'notabool'"))
+            ),
+        }
+    }
+
+    #[test]
+    fn deserializes_from_prefixed_fieldnames() {
+        let data = vec![
+            (String::from("APP_BAR"), String::from("test")),
+            (String::from("APP_BAZ"), String::from("true")),
+            (String::from("APP_DOOM"), String::from("")),
+            (String::from("APP_BOOM"), String::from("4,5")),
+            (String::from("APP_SIZE"), String::from("small")),
+            (String::from("APP_PROVIDED"), String::from("test")),
+            (String::from("APP_NEWTYPE"), String::from("42")),
+        ];
+        match prefixed("APP_").from_iter::<_, Foo>(data) {
+            Ok(actual) => assert_eq!(
+                actual,
+                Foo {
+                    bar: String::from("test"),
+                    baz: true,
+                    zoom: None,
+                    doom: vec![],
+                    boom: vec!["4".to_string(), "5".to_string()],
+                    kaboom: 8080,
+                    debug_mode: false,
+                    size: Size::Small,
+                    provided: Some(String::from("test")),
+                    newtype: CustomNewType(42)
+                }
+            ),
+            Err(e) => panic!("{:#?}", e),
+        }
+    }
+
+    #[test]
+    fn prefixed_strips_prefixes() {
+        let mut expected = HashMap::new();
+        expected.insert("foo".to_string(), "bar".to_string());
+        assert_eq!(
+            prefixed("PRE_").from_iter(vec![("PRE_FOO".to_string(), "bar".to_string())]),
+            Ok(expected)
+        );
+    }
+
+    #[test]
+    fn prefixed_doesnt_parse_non_prefixed() {
+        let mut expected = HashMap::new();
+        expected.insert("foo".to_string(), 12);
+        assert_eq!(
+            prefixed("PRE_").from_iter(vec![
+                ("FOO".to_string(), "asd".to_string()),
+                ("PRE_FOO".to_string(), "12".to_string())
+            ]),
+            Ok(expected)
+        );
+    }
+
+    #[test]
+    fn deserialize_optional() {
+        #[derive(Deserialize)]
+        #[serde(default)]
+        struct X {
+            val: Option<i32>,
+        }
+
+        impl Default for X {
+            fn default() -> Self {
+                Self { val: Some(123) }
+            }
+        }
+
+        let data = vec![(String::from("VAL"), String::from(""))];
+
+        let res = from_iter::<_, X>(data).unwrap();
+        assert_eq!(res.val, None)
+    }
+}
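One place where this vendored copy appears to diverge from upstream envy is `deserialize_bool`, which additionally accepts `"1"` and `"0"`. A small check of the accepted and rejected spellings (variable names invented):

```rust
use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct Flags {
    debug_mode: bool,
    verbose: bool,
}

fn main() {
    // deserialize_bool accepts "1"/"true" and "0"/"false"
    // (ASCII case-insensitive); anything else is a custom error.
    let ok = vec![
        ("DEBUG_MODE".to_owned(), "1".to_owned()),
        ("VERBOSE".to_owned(), "False".to_owned()),
    ];
    let flags = envy::from_iter::<_, Flags>(ok).unwrap();
    assert!(flags.debug_mode);
    assert!(!flags.verbose);

    let bad = vec![
        ("DEBUG_MODE".to_owned(), "yes".to_owned()),
        ("VERBOSE".to_owned(), "0".to_owned()),
    ];
    assert!(envy::from_iter::<_, Flags>(bad).is_err());
}
```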
vec!["4".to_string(), "5".to_string()], + kaboom: 8080, + debug_mode: false, + size: Size::Small, + provided: Some(String::from("test")), + newtype: CustomNewType(42) + } + ), + Err(e) => panic!("{:#?}", e), + } + } + + #[test] + fn prefixed_strips_prefixes() { + let mut expected = HashMap::new(); + expected.insert("foo".to_string(), "bar".to_string()); + assert_eq!( + prefixed("PRE_").from_iter(vec![("PRE_FOO".to_string(), "bar".to_string())]), + Ok(expected) + ); + } + + #[test] + fn prefixed_doesnt_parse_non_prefixed() { + let mut expected = HashMap::new(); + expected.insert("foo".to_string(), 12); + assert_eq!( + prefixed("PRE_").from_iter(vec![ + ("FOO".to_string(), "asd".to_string()), + ("PRE_FOO".to_string(), "12".to_string()) + ]), + Ok(expected) + ); + } + + #[test] + fn deserialize_optional() { + #[derive(Deserialize)] + #[serde(default)] + struct X { + val: Option, + } + + impl Default for X { + fn default() -> Self { + Self { val: Some(123) } + } + } + + let data = vec![(String::from("VAL"), String::from(""))]; + + let res = from_iter::<_, X>(data).unwrap(); + assert_eq!(res.val, None) + } +} diff --git a/resources/icon.opt.svg b/resources/icon.opt.svg new file mode 100644 index 0000000..63d9a3e --- /dev/null +++ b/resources/icon.opt.svg @@ -0,0 +1 @@ + diff --git a/resources/icon.svg b/resources/icon.svg new file mode 100644 index 0000000..718ebc4 --- /dev/null +++ b/resources/icon.svg @@ -0,0 +1,56 @@ + + + + + + + + + + + + + diff --git a/src/app.rs b/src/app.rs new file mode 100644 index 0000000..27980bf --- /dev/null +++ b/src/app.rs @@ -0,0 +1,410 @@ +use std::{ops::Bound, path::PathBuf, sync::Arc}; + +use async_zip::tokio::read::ZipEntryReader; +use axum::{ + body::Body, + extract::{Host, Request, State}, + http::{Response, Uri}, + response::{IntoResponse, Redirect}, + routing::{any, get, post}, + Form, Json, Router, +}; +use headers::HeaderMapExt; +use http::{HeaderMap, StatusCode}; +use serde::Deserialize; +use tokio::{ + fs::File, + io::{AsyncBufReadExt, AsyncReadExt, BufReader}, +}; +use tokio_util::{ + compat::{FuturesAsyncReadCompatExt, TokioAsyncReadCompatExt}, + io::ReaderStream, +}; +use tower_http::trace::{DefaultOnResponse, TraceLayer}; + +use crate::{ + artifact_api::{Artifact, ArtifactApi, ArtifactOrRun}, + cache::{Cache, CacheEntry, GetEntryResult, GetFileResult, GetFileResultFile, IndexEntry}, + config::Config, + error::{Error, Result}, + gzip_reader::{PrecompressedGzipReader, GZIP_EXTRA_LEN}, + query::Query, + templates::{self, LinkItem}, + util::{self, InsertTypedHeader}, + App, +}; + +#[derive(Clone)] +struct AppState { + i: Arc, +} + +struct AppInner { + cfg: Config, + cache: Cache, + api: ArtifactApi, +} + +impl Default for App { + fn default() -> Self { + Self::new() + } +} + +#[derive(Deserialize)] +struct UrlForm { + url: String, +} + +impl App { + pub fn new() -> Self { + Self + } + + fn new_state(&self) -> AppState { + AppState::new() + } + + pub async fn run(&self) -> Result<()> { + let address = "0.0.0.0:3000"; + let listener = tokio::net::TcpListener::bind(address).await?; + tracing::info!("Listening on http://{address}"); + + let router = Router::new() + // Prevent search indexing since artifactview serves temporary artifacts + .route( + "/robots.txt", + get(|| async { "User-agent: *\nDisallow: /\n" }), + ) + // Put the API in the .well-known folder, since it is disabled for pages + .route("/.well-known/api/artifacts", get(Self::get_artifacts)) + .route("/.well-known/api/artifact", get(Self::get_artifact)) + .route("/.well-known/api/files", 
get(Self::get_files)) + // Prevent access to the .well-known folder since it enables abuse + // (e.g. SSL certificate registration by an attacker) + .route("/.well-known/*path", any(|| async { Error::Inaccessible })) + // Serve artifact pages + .route("/", get(Self::get_page)) + .route("/", post(Self::post_homepage)) + .fallback(get(Self::get_page)) + .with_state(self.new_state()) + // Log requests + .layer( + TraceLayer::new_for_http() + .make_span_with(|request: &Request| { + tracing::error_span!("request", url = util::full_url_from_request(request),) + }) + .on_response(DefaultOnResponse::new().level(tracing::Level::INFO)), + ); + axum::serve(listener, router).await?; + Ok(()) + } + + async fn get_page( + State(state): State, + Host(host): Host, + uri: Uri, + request: Request, + ) -> Result> { + let subdomain = util::get_subdomain(&host, &state.i.cfg.load().root_domain)?; + + if subdomain.is_empty() { + // Main page + if uri.path() != "/" { + return Err(Error::NotFound("path".into())); + } + Ok(Response::builder() + .typed_header(headers::ContentType::html()) + .body(templates::Index::default().to_string().into())?) + } else { + let query = Query::from_subdomain(subdomain)?; + let path = percent_encoding::percent_decode_str(uri.path()).decode_utf8_lossy(); + let hdrs = request.headers(); + + let res = state.i.cache.get_entry(&state.i.api, &query).await?; + match res { + GetEntryResult::Entry { entry, zip_path } => { + match entry.get_file(&path, uri.query().unwrap_or_default())? { + GetFileResult::File(res) => { + Self::serve_artifact_file(state, entry, zip_path, res, hdrs).await + } + GetFileResult::Listing(listing) => { + if !path.ends_with('/') { + return Ok(Redirect::to(&format!("{path}/")).into_response()); + } + + // TODO: store actual artifact names + let artifact_name = format!("A{}", query.artifact.unwrap()); + + let mut path_components = vec![ + LinkItem { + name: query.shortid(), + url: state + .i + .cfg + .url_with_subdomain(&query.subdomain_with_artifact(None)), + }, + LinkItem { + name: artifact_name.to_owned(), + url: "/".to_string(), + }, + ]; + let mut buf = String::new(); + for s in path.split('/').filter(|s| !s.is_empty()) { + buf.push('/'); + buf += s; + path_components.push(LinkItem { + name: s.to_owned(), + url: buf.clone(), + }); + } + + let tmpl = templates::Listing { + main_url: state.i.cfg.main_url(), + version: templates::Version, + artifact_name: &artifact_name, + path_components, + n_dirs: listing.n_dirs, + n_files: listing.n_files, + has_parent: listing.has_parent, + entries: listing.entries, + }; + + Ok(Response::builder() + .typed_header(headers::ContentType::html()) + .body(tmpl.to_string().into())?) + } + } + } + GetEntryResult::Artifacts(artifacts) => { + if uri.path() != "/" { + return Err(Error::NotFound("path".into())); + } + if artifacts.is_empty() { + return Err(Error::NotFound("artifacts".into())); + } + let tmpl = templates::Selection { + main_url: state.i.cfg.main_url(), + run_url: &query.forge_url(), + run_name: &query.shortid(), + artifacts: artifacts + .into_iter() + .map(|a| LinkItem::from_artifact(a, &query, &state.i.cfg)) + .collect(), + }; + Ok(Response::builder() + .typed_header(headers::ContentType::html()) + .body(tmpl.to_string().into())?) 
+ } + } + } + } + + async fn post_homepage( + State(state): State, + Host(host): Host, + Form(url): Form, + ) -> Result { + let subdomain = util::get_subdomain(&host, &state.i.cfg.load().root_domain)?; + + if subdomain.is_empty() { + let query = Query::from_forge_url(&url.url)?; + let subdomain = query.subdomain(); + let target = format!( + "{}{}.{}", + state.i.cfg.url_proto(), + subdomain, + state.i.cfg.load().root_domain + ); + Ok(Redirect::to(&target)) + } else { + Err(Error::MethodNotAllowed) + } + } + + async fn serve_artifact_file( + state: AppState, + entry: Arc, + zip_path: PathBuf, + res: GetFileResultFile, + hdrs: &HeaderMap, + ) -> Result> { + let file = res.file; + + // Dont serve files above the configured size limit + let lim = state.i.cfg.load().max_file_size; + if lim.is_some_and(|lim| file.uncompressed_size > lim) { + return Err(Error::BadRequest( + format!( + "file too large (size: {}, limit: {})", + file.uncompressed_size, + lim.unwrap() + ) + .into(), + )); + } + + let mut resp = Response::builder() + .status(res.status) + .typed_header(headers::AcceptRanges::bytes()); + if let Some(mime) = res.mime { + resp = resp.typed_header(headers::ContentType::from(mime)); + } + if let Some(last_mod) = entry.last_modified { + resp = resp.typed_header(headers::LastModified::from(last_mod)); + } + + // handle if-(un)modified queries + if let Some(modified) = entry.last_modified { + if let Some(if_unmodified_since) = hdrs.typed_get::() { + if !if_unmodified_since.precondition_passes(modified) { + return Ok(resp + .status(StatusCode::PRECONDITION_FAILED) + .body(Body::empty())?); + } + } + if let Some(if_modified_since) = hdrs.typed_get::() { + if !if_modified_since.is_modified(modified) { + return Ok(resp.status(StatusCode::NOT_MODIFIED).body(Body::empty())?); + } + } + } + + let zip_file = File::open(&zip_path).await?; + let range = hdrs.typed_get::(); + + if matches!(file.compression, async_zip::Compression::Deflate) + && range.is_none() + && util::accepts_gzip(hdrs) + { + // Read compressed file + let reader = PrecompressedGzipReader::new(zip_file, &file).await?; + resp = resp + .typed_header(headers::ContentLength( + u64::from(file.compressed_size) + GZIP_EXTRA_LEN, + )) + .typed_header(headers::ContentEncoding::gzip()); + + Ok(resp.body(Body::from_stream(ReaderStream::new(reader)))?) 
+ } else { + // Read decompressed file + let mut zip_reader = BufReader::new(zip_file); + util::seek_to_data_offset(&mut zip_reader, file.header_offset.into()).await?; + let reader = ZipEntryReader::new_with_owned( + zip_reader.compat(), + file.compression, + file.compressed_size.into(), + ); + + if let Some(rheader) = range { + let total_len = u64::from(file.uncompressed_size); + let mut ranges = rheader.satisfiable_ranges(total_len); + if let Some(range) = ranges.next() { + if ranges.next().is_some() { + return Err(Error::BadRequest( + "multipart ranges are not implemented".into(), + )); + } + let start = match range.0 { + Bound::Included(n) => n, + Bound::Excluded(n) => n + 1, + Bound::Unbounded => 0, + }; + let end = match range.1 { + Bound::Included(n) => n + 1, + Bound::Excluded(n) => n, + Bound::Unbounded => total_len, + }; + + let mut bufreader = tokio::io::BufReader::new(reader.compat()); + + // Advance the BufReader by the parsed offset + let mut to_consume = usize::try_from(start)?; + while to_consume > 0 { + let take = bufreader.fill_buf().await?.len().min(to_consume); + bufreader.consume(take); + to_consume -= take; + } + + let content_length = end - start; + + return Ok(resp + .status(StatusCode::PARTIAL_CONTENT) + .typed_header(headers::ContentLength(content_length)) + .typed_header( + headers::ContentRange::bytes(range, total_len) + .map_err(|e| Error::Internal(e.to_string().into()))?, + ) + .body(Body::from_stream(ReaderStream::new( + bufreader.take(content_length), + )))?); + } + } + Ok(resp + .typed_header(headers::ContentLength(file.uncompressed_size.into())) + .body(Body::from_stream(ReaderStream::new(reader.compat())))?) + } + } + + /// API endpoint to list artifacts of a CI run + async fn get_artifacts( + State(state): State, + Host(host): Host, + ) -> Result>> { + let subdomain = util::get_subdomain(&host, &state.i.cfg.load().root_domain)?; + let query = Query::from_subdomain(subdomain)?; + let artifacts = state.i.api.list(&query).await?; + Ok(Json(artifacts)) + } + + /// API endpoint to get the metadata of the current artifact + async fn get_artifact( + State(state): State, + Host(host): Host, + ) -> Result> { + let subdomain = util::get_subdomain(&host, &state.i.cfg.load().root_domain)?; + let query = Query::from_subdomain(subdomain)?; + + if query.artifact.is_none() { + return Err(Error::BadRequest("no artifact specified".into())); + } + + let artifact = state.i.api.fetch(&query).await?; + match artifact { + ArtifactOrRun::Artifact(artifact) => Ok(Json(artifact)), + ArtifactOrRun::Run(_) => unreachable!(), + } + } + + /// API endpoint to get a file listing + async fn get_files( + State(state): State, + Host(host): Host, + ) -> Result>> { + let subdomain = util::get_subdomain(&host, &state.i.cfg.load().root_domain)?; + let query = Query::from_subdomain(subdomain)?; + + if query.artifact.is_none() { + return Err(Error::BadRequest("no artifact specified".into())); + } + + let res = state.i.cache.get_entry(&state.i.api, &query).await?; + let entry = match res { + GetEntryResult::Entry { entry, .. 
} => entry, + GetEntryResult::Artifacts(_) => unreachable!(), + }; + let files = entry.get_files(); + Ok(Json(files)) + } +} + +impl AppState { + pub fn new() -> Self { + let cfg = Config::default(); + let cache = Cache::new(cfg.clone()); + let api = ArtifactApi::new(cfg.clone()); + Self { + i: Arc::new(AppInner { cfg, cache, api }), + } + } +} diff --git a/src/artifact_api.rs b/src/artifact_api.rs index c8ee33d..e49cafa 100644 --- a/src/artifact_api.rs +++ b/src/artifact_api.rs @@ -1,10 +1,16 @@ //! API-Client to fetch CI artifacts from Github and Forgejo -use anyhow::{anyhow, Result}; -use reqwest::{header, Client, ClientBuilder, IntoUrl, RequestBuilder}; +use std::{fs::File, io::Cursor, path::Path}; + +use http::header; +use reqwest::{Client, ClientBuilder, IntoUrl, RequestBuilder, Url}; use serde::{Deserialize, Serialize}; -use crate::{config::Config, query::Query}; +use crate::{ + config::Config, + error::{Error, Result}, + query::Query, +}; pub struct ArtifactApi { http: Client, @@ -20,6 +26,11 @@ pub struct Artifact { pub download_url: String, } +pub enum ArtifactOrRun { + Artifact(Artifact), + Run(Vec), +} + #[derive(Deserialize)] struct GithubArtifact { id: u64, @@ -61,7 +72,7 @@ impl From for Artifact { } impl ForgejoArtifact { - fn to_artifact(self, id: u64, query: &Query) -> Artifact { + fn into_artifact(self, id: u64, query: &Query) -> Artifact { Artifact { download_url: format!( "https://{}/{}/{}/actions/runs/{}/artifacts/{}", @@ -92,26 +103,76 @@ impl ArtifactApi { pub async fn list(&self, query: &Query) -> Result> { if query.is_github() { - self.list_forgejo(query).await - } else { self.list_github(query).await + } else { + self.list_forgejo(query).await } } - pub async fn fetch(&self, query: &Query) -> Result { + pub async fn fetch(&self, query: &Query) -> Result { if query.is_github() { self.fetch_github(query).await } else { // Forgejo currently has no API for fetching single artifacts let mut artifacts = self.list_forgejo(query).await?; - let i = usize::try_from(query.artifact)?; - if i == 0 || i > artifacts.len() { - return Err(anyhow!("Artifact not found")); + + match query.artifact { + Some(artifact) => { + let i = usize::try_from(artifact)?; + if i == 0 || i > artifacts.len() { + return Err(Error::NotFound("artifact".into())); + } + Ok(ArtifactOrRun::Artifact(artifacts.swap_remove(i - 1))) + } + None => Ok(ArtifactOrRun::Run(artifacts)), } - Ok(artifacts.swap_remove(i - 1)) } } + pub async fn download(&self, artifact: &Artifact, path: &Path) -> Result<()> { + if artifact.expired { + return Err(Error::Expired); + } + + let lim = self.cfg.load().max_artifact_size; + let check_lim = |size: u64| { + if lim.is_some_and(|lim| u32::try_from(size).map(|size| size > lim).unwrap_or(true)) { + Err(Error::BadRequest( + format!( + "artifact too large (size: {}, limit: {})", + artifact.size, + lim.unwrap() + ) + .into(), + )) + } else { + Ok(()) + } + }; + check_lim(artifact.size)?; + + let url = Url::parse(&artifact.download_url)?; + let req = if url.domain() == Some("api.github.com") { + self.get_github(url) + } else { + self.http.get(url) + }; + + let resp = req.send().await?.error_for_status()?; + + if let Some(act_len) = resp.content_length() { + check_lim(act_len)?; + } + + let tmp_path = path.with_extension(format!("tmp.{:x}", rand::random::())); + let mut file = File::create(&tmp_path)?; + let mut content = Cursor::new(resp.bytes().await?); + std::io::copy(&mut content, &mut file)?; + std::fs::rename(&tmp_path, path)?; + tracing::info!("Downloaded artifact from {}", 
artifact.download_url); + Ok(()) + } + async fn list_forgejo(&self, query: &Query) -> Result> { let url = format!( "https://{}/{}/{}/actions/runs/{}/artifacts", @@ -131,7 +192,7 @@ impl ArtifactApi { .artifacts .into_iter() .enumerate() - .map(|(i, a)| a.to_artifact(i as u64 + 1, query)) + .map(|(i, a)| a.into_artifact(i as u64 + 1, query)) .collect::>(); Ok(artifacts) @@ -154,21 +215,25 @@ impl ArtifactApi { Ok(resp.artifacts.into_iter().map(Artifact::from).collect()) } - async fn fetch_github(&self, query: &Query) -> Result { - let url = format!( - "https://api.github.com/repos/{}/{}/actions/artifacts/{}", - query.user, query.repo, query.artifact - ); + async fn fetch_github(&self, query: &Query) -> Result { + match query.artifact { + Some(artifact) => { + let url = format!( + "https://api.github.com/repos/{}/{}/actions/artifacts/{}", + query.user, query.repo, artifact + ); - let artifact = self - .get_github(url) - .send() - .await? - .error_for_status()? - .json::() - .await?; - - Ok(artifact.into()) + let artifact = self + .get_github(url) + .send() + .await? + .error_for_status()? + .json::() + .await?; + Ok(ArtifactOrRun::Artifact(artifact.into())) + } + None => Ok(ArtifactOrRun::Run(self.list_github(query).await?)), + } } fn get_github(&self, url: U) -> RequestBuilder { @@ -185,7 +250,7 @@ impl ArtifactApi { mod tests { use crate::{config::Config, query::Query}; - use super::ArtifactApi; + use super::{ArtifactApi, ArtifactOrRun}; #[tokio::test] async fn fetch_forgejo() { @@ -194,14 +259,22 @@ mod tests { user: "HSA".to_owned(), repo: "Visitenbuch".to_owned(), run: 32, - artifact: 1, + artifact: Some(1), }; let api = ArtifactApi::new(Config::default()); let res = api.fetch(&query).await.unwrap(); - assert_eq!(res.name, "playwright-report"); - assert_eq!(res.download_url, "https://code.thetadev.de/HSA/Visitenbuch/actions/runs/32/artifacts/playwright-report"); - assert_eq!(res.id, 1); - assert_eq!(res.size, 574292); + + if let ArtifactOrRun::Artifact(res) = res { + assert_eq!(res.name, "playwright-report"); + assert_eq!( + res.download_url, + "https://code.thetadev.de/HSA/Visitenbuch/actions/runs/32/artifacts/playwright-report" + ); + assert_eq!(res.id, 1); + assert_eq!(res.size, 574292); + } else { + panic!("got run"); + } } #[tokio::test] @@ -211,13 +284,21 @@ mod tests { user: "actions".to_owned(), repo: "upload-artifact".to_owned(), run: 8805345396, - artifact: 1440556464, + artifact: Some(1440556464), }; let api = ArtifactApi::new(Config::default()); let res = api.fetch(&query).await.unwrap(); - assert_eq!(res.name, "Artifact-Wildcard-macos-latest"); - assert_eq!(res.download_url, "https://api.github.com/repos/actions/upload-artifact/actions/artifacts/1440556464/zip"); - assert_eq!(res.id, 1440556464); - assert_eq!(res.size, 334); + + if let ArtifactOrRun::Artifact(res) = res { + assert_eq!(res.name, "Artifact-Wildcard-macos-latest"); + assert_eq!( + res.download_url, + "https://api.github.com/repos/actions/upload-artifact/actions/artifacts/1440556464/zip" + ); + assert_eq!(res.id, 1440556464); + assert_eq!(res.size, 334); + } else { + panic!("got run"); + } } } diff --git a/src/cache.rs b/src/cache.rs new file mode 100644 index 0000000..0c5a5bf --- /dev/null +++ b/src/cache.rs @@ -0,0 +1,317 @@ +use std::{ + borrow::Cow, + collections::{BTreeMap, HashMap}, + path::{Path, PathBuf}, + sync::Arc, + time::{Duration, SystemTime}, +}; + +use async_zip::{tokio::read::fs::ZipFileReader, Compression}; +use http::StatusCode; +use mime::Mime; +use path_macro::path; +use 
quick_cache::sync::Cache as QuickCache;
+use serde::Serialize;
+use serde_hex::{SerHex, Strict};
+
+use crate::{
+    artifact_api::{Artifact, ArtifactApi, ArtifactOrRun},
+    config::Config,
+    error::{Error, Result},
+    query::Query,
+    util,
+};
+
+pub struct Cache {
+    cfg: Config,
+    qc: QuickCache<[u8; 16], Arc<CacheEntry>>,
+}
+
+pub struct CacheEntry {
+    pub files: HashMap<String, FileEntry>,
+    pub last_modified: Option<SystemTime>,
+}
+
+#[derive(Clone)]
+pub struct FileEntry {
+    pub header_offset: u32,
+    pub uncompressed_size: u32,
+    pub compressed_size: u32,
+    pub crc32: u32,
+    pub compression: Compression,
+}
+
+pub enum GetEntryResult {
+    Entry {
+        entry: Arc<CacheEntry>,
+        zip_path: PathBuf,
+    },
+    Artifacts(Vec<Artifact>),
+}
+
+pub enum GetFileResult {
+    File(GetFileResultFile),
+    Listing(Listing),
+}
+
+pub struct GetFileResultFile {
+    pub file: FileEntry,
+    pub mime: Option<Mime>,
+    pub status: StatusCode,
+}
+
+#[derive(Serialize)]
+pub struct IndexEntry {
+    pub name: String,
+    pub size: u32,
+    #[serde(with = "SerHex::<Strict>")]
+    pub crc32: u32,
+}
+
+pub struct Listing {
+    pub entries: Vec<ListingEntry>,
+    pub n_files: usize,
+    pub n_dirs: usize,
+    pub has_parent: bool,
+}
+
+pub struct ListingEntry {
+    pub name: String,
+    pub url: String,
+    pub size: u32,
+    pub crc32: String,
+    pub is_dir: bool,
+}
+
+impl Cache {
+    pub fn new(cfg: Config) -> Self {
+        Self {
+            cfg,
+            qc: QuickCache::new(50),
+        }
+    }
+
+    pub fn get_path(&self, query: &Query) -> PathBuf {
+        path!(self.cfg.load().cache_dir / format!("{}.zip", hex::encode(query.siphash())))
+    }
+
+    pub async fn get_entry(&self, api: &ArtifactApi, query: &Query) -> Result<GetEntryResult> {
+        if query.artifact.is_some() {
+            let hash = query.siphash();
+            let zip_path = path!(self.cfg.load().cache_dir / format!("{}.zip", hex::encode(hash)));
+            if !zip_path.is_file() {
+                let artifact = api.fetch(query).await?;
+                let artifact = match artifact {
+                    ArtifactOrRun::Artifact(artifact) => artifact,
+                    ArtifactOrRun::Run(_) => unreachable!(),
+                };
+                api.download(&artifact, &zip_path).await?;
+            }
+
+            let timeout = self
+                .cfg
+                .load()
+                .zip_timeout_ms
+                .map(|t| Duration::from_millis(t.into()));
+            let mut entry = self
+                .qc
+                .get_or_insert_async(&hash, async {
+                    Ok::<_, Error>(Arc::new(CacheEntry::new(&zip_path, timeout).await?))
+                })
+                .await?;
+
+            // Verify if the cached entry is fresh
+            let meta = tokio::fs::metadata(&zip_path).await?;
+            if meta.modified().ok() != entry.last_modified {
+                tracing::info!("cached file {zip_path:?} changed");
+                entry = Arc::new(CacheEntry::new(&zip_path, timeout).await?);
+                self.qc.insert(hash, entry.clone());
+            }
+            Ok(GetEntryResult::Entry { entry, zip_path })
+        } else {
+            let run = api.fetch(query).await?;
+            let artifacts = match run {
+                ArtifactOrRun::Artifact(_) => unreachable!(),
+                ArtifactOrRun::Run(run) => run,
+            };
+
+            Ok(GetEntryResult::Artifacts(artifacts))
+        }
+    }
+}
+
+impl CacheEntry {
+    async fn new(zip_path: &Path, timeout: Option<Duration>) -> Result<Self> {
+        let meta = tokio::fs::metadata(&zip_path).await?;
+        let zip_fut = ZipFileReader::new(&zip_path);
+        let zip = match timeout {
+            Some(timeout) => tokio::time::timeout(timeout, zip_fut).await??,
+            None => zip_fut.await?,
+        };
+
+        Ok(Self {
+            files: zip
+                .file()
+                .entries()
+                .iter()
+                .filter_map(|entry| {
+                    Some((
+                        entry.filename().as_str().ok()?.to_owned(),
+                        FileEntry {
+                            header_offset: entry.header_offset().try_into().ok()?,
+                            uncompressed_size: entry.uncompressed_size().try_into().ok()?,
+                            compressed_size: entry.compressed_size().try_into().ok()?,
+                            crc32: entry.crc32(),
+                            compression: entry.compression(),
+                        },
+                    ))
+                })
+                .collect(),
+            last_modified: meta.modified().ok(),
+        })
+    }
+
+    pub fn get_file(&self, path: &str, url_query: &str) -> Result<GetFileResult> {
+        let path = path.trim_start_matches('/');
+        let mut index_path: Option<Cow<str>> = None;
+
+        if path.is_empty() {
+            // Special case: open index.html directly
+            index_path = Some("index.html".into());
+        }
+        // Attempt to access the following pages
+        // 1. Site path directly
+        // 2. Site path + `/index.html`
+        else if let Some(file) = self.files.get(path) {
+            return Ok(GetFileResult::File(GetFileResultFile {
+                file: file.clone(),
+                mime: util::path_mime(path),
+                status: StatusCode::OK,
+            }));
+        } else if util::site_path_ext(path).is_none() {
+            index_path = Some(format!("{path}/index.html").into());
+        }
+
+        if let Some(file) = index_path
+            .and_then(|p: Cow<str>| self.files.get(p.as_ref()))
+            .or_else(|| self.files.get("200.html"))
+        {
+            // index.html or SPA entrypoint
+            return Ok(GetFileResult::File(GetFileResultFile {
+                file: file.clone(),
+                mime: Some(mime::TEXT_HTML),
+                status: StatusCode::OK,
+            }));
+        }
+
+        // Directory listing
+        let path_as_dir: Cow<str> = if path.is_empty() || path.ends_with('/') {
+            path.into()
+        } else {
+            format!("{path}/").into()
+        };
+        if self
+            .files
+            .keys()
+            .any(|n| n.starts_with(path_as_dir.as_ref()))
+        {
+            let mut rev = false;
+            let mut col = b'N';
+            for (k, v) in url::form_urlencoded::parse(url_query.as_bytes()) {
+                if k == "C" && !v.is_empty() {
+                    col = v.as_bytes()[0];
+                } else if k == "O" {
+                    rev = v == "D";
+                }
+            }
+            return Ok(GetFileResult::Listing(self.get_listing(
+                &path_as_dir,
+                col,
+                rev,
+            )));
+        } else if let Some(file) = self.files.get("404.html") {
+            // Custom 404 error page
+            return Ok(GetFileResult::File(GetFileResultFile {
+                file: file.clone(),
+                mime: Some(mime::TEXT_HTML),
+                status: StatusCode::NOT_FOUND,
+            }));
+        }
+
+        Err(Error::NotFound("requested file".into()))
+    }
+
+    pub fn get_files(&self) -> Vec<IndexEntry> {
+        self.files
+            .iter()
+            .map(|(n, entry)| IndexEntry {
+                name: n.to_owned(),
+                size: entry.uncompressed_size,
+                crc32: entry.crc32,
+            })
+            .collect()
+    }
+
+    fn get_listing(&self, path: &str, col: u8, rev: bool) -> Listing {
+        let entries = self
+            .files
+            .iter()
+            .filter_map(|(n, entry)| {
+                n.strip_prefix(path).map(|n| {
+                    let n = n.split_inclusive('/').next().unwrap();
+                    (n, entry)
+                })
+            })
+            .collect::<BTreeMap<_, _>>();
+
+        // Put directories first
+        let mut directories = Vec::new();
+        let mut files = Vec::new();
+
+        let entries_iter: Box<dyn Iterator<Item = (&str, &FileEntry)>> = if col == b'N' && rev {
+            Box::new(entries.into_iter().rev())
+        } else {
+            Box::new(entries.into_iter())
+        };
+
+        for (n, entry) in entries_iter {
+            if n.ends_with('/') {
+                directories.push(ListingEntry {
+                    name: n.to_owned(),
+                    // Link entries by absolute path: directory prefix + entry name
+                    url: format!("/{path}{n}"),
+                    size: 0,
+                    crc32: "-".to_string(),
+                    is_dir: true,
+                });
+            } else {
+                files.push(ListingEntry {
+                    name: n.to_owned(),
+                    url: format!("/{path}{n}"),
+                    size: entry.uncompressed_size,
+                    crc32: hex::encode(entry.crc32.to_le_bytes()),
+                    is_dir: false,
+                });
+            }
+        }
+
+        // Sort by size
+        if col == b'S' {
+            if rev {
+                files.sort_by(|a, b| b.size.cmp(&a.size));
+            } else {
+                files.sort_by_key(|f| f.size);
+            }
+        }
+
+        let n_dirs = directories.len();
+        let n_files = files.len();
+        directories.append(&mut files);
+
+        Listing {
+            entries: directories,
+            n_dirs,
+            n_files,
+            has_parent: !path.is_empty(),
+        }
+    }
+}
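`get_path` above derives the on-disk zip name from `Query::siphash`, defined in src/query.rs further down. A standalone sketch of that keying, mirroring `Query`'s fields so the derived `Hash` matches; it should reproduce the expected value from the query.rs test suite:

```rust
use std::hash::Hash;

use siphasher::sip128::{Hasher128, SipHasher};

// Field-for-field mirror of Query (same order and types), so the
// derived Hash implementation feeds SipHash the same bytes.
#[derive(Hash)]
struct Query {
    host: String,
    user: String,
    repo: String,
    run: u64,
    artifact: Option<u64>,
}

fn main() {
    let q = Query {
        host: "github.com".to_owned(),
        user: "thetadev".to_owned(),
        repo: "newpipe-extractor".to_owned(),
        run: 14,
        artifact: Some(123),
    };
    let mut h = SipHasher::new();
    q.hash(&mut h);
    let key: [u8; 16] = h.finish128().as_bytes();
    // Matches the expected hash asserted in the query.rs tests.
    assert_eq!(hex::encode(key), "e523468ef42c848155a43f40895dff5a");
    // The cache stores the artifact under <cache_dir>/<hex key>.zip.
    println!("/tmp/artifactview/{}.zip", hex::encode(key));
}
```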
diff --git a/src/config.rs b/src/config.rs
index 5e241e8..e24657d 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -1,35 +1,106 @@
-use std::sync::Arc;
+use std::{
+    path::{Path, PathBuf},
+    sync::Arc,
+};
 
-use arc_swap::{ArcSwap, Guard};
+use serde::{Deserialize, Serialize};
+
+use crate::error::{Error, Result};
 
 #[derive(Clone)]
 pub struct Config {
-    inner: Arc<ArcSwap<ConfigData>>,
+    inner: Arc<ConfigInner>,
 }
 
-#[derive(Default)]
+struct ConfigInner {
+    data: ConfigData,
+    main_url: String,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+#[serde(default)]
 pub struct ConfigData {
+    pub cache_dir: PathBuf,
+    pub root_domain: String,
+    pub no_https: bool,
+    pub max_artifact_size: Option<u32>,
+    pub max_file_size: Option<u32>,
+    pub max_age_h: Option<u32>,
+    pub zip_timeout_ms: Option<u32>,
     pub github_token: Option<String>,
 }
 
+impl Default for ConfigData {
+    fn default() -> Self {
+        Self {
+            cache_dir: Path::new("/tmp/artifactview").into(),
+            root_domain: "localhost:3000".to_string(),
+            no_https: false,
+            max_artifact_size: Some(100_000_000),
+            max_file_size: Some(100_000_000),
+            max_age_h: Some(12),
+            zip_timeout_ms: Some(1000),
+            github_token: None,
+        }
+    }
+}
+
 impl Default for Config {
     fn default() -> Self {
-        Self::new(ConfigData::default())
+        Self::new().expect("Could not initialize config")
+    }
+}
+
+impl ConfigData {
+    fn url_proto(&self) -> &'static str {
+        if self.no_https {
+            "http://"
+        } else {
+            "https://"
+        }
     }
 }
 
 impl Config {
-    pub fn new(data: ConfigData) -> Self {
-        Self {
-            inner: Arc::new(ArcSwap::from_pointee(data)),
-        }
+    pub fn new() -> Result<Self> {
+        let data =
+            envy::from_env::<ConfigData>().map_err(|e| Error::Internal(e.to_string().into()))?;
+        Self::from_data(data)
     }
 
-    pub fn update(&self, data: ConfigData) {
-        self.inner.swap(Arc::new(data));
+    pub fn from_data(data: ConfigData) -> Result<Self> {
+        Self::before_update(&data)?;
+        Ok(Self {
+            inner: Arc::new(ConfigInner {
+                main_url: format!("{}{}", data.url_proto(), data.root_domain),
+                data,
+            }),
+        })
     }
 
-    pub fn load(&self) -> Guard<Arc<ConfigData>> {
-        self.inner.load()
+    fn before_update(data: &ConfigData) -> Result<()> {
+        std::fs::create_dir_all(&data.cache_dir)?;
+        Ok(())
+    }
+
+    pub fn load(&self) -> &ConfigData {
+        &self.inner.data
+    }
+
+    pub fn url_proto(&self) -> &'static str {
+        self.inner.data.url_proto()
+    }
+
+    pub fn url_with_subdomain(&self, subdomain: &str) -> String {
+        format!(
+            "{}{}.{}",
+            self.url_proto(),
+            subdomain,
+            self.inner.data.root_domain
+        )
+    }
+
+    pub fn main_url(&self) -> &str {
+        &self.inner.main_url
+    }
 }
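Because `ConfigData` combines `#[serde(default)]` with envy, each env var overrides its `Default` value field by field, while unset vars fall through to the defaults. A trimmed-down sketch of that precedence with just two of the fields (values invented; `from_iter` stands in for the real environment):

```rust
use serde::Deserialize;

// Stand-in for two of ConfigData's fields, with the same serde attributes.
#[derive(Deserialize, Debug)]
#[serde(default)]
struct ConfigData {
    root_domain: String,
    max_age_h: Option<u32>,
}

impl Default for ConfigData {
    fn default() -> Self {
        Self {
            root_domain: "localhost:3000".to_string(),
            max_age_h: Some(12),
        }
    }
}

fn main() {
    // Only ROOT_DOMAIN is set; MAX_AGE_H keeps its default.
    let vars = vec![("ROOT_DOMAIN".to_owned(), "artifacts.example.org".to_owned())];
    let data = envy::from_iter::<_, ConfigData>(vars).unwrap();
    assert_eq!(data.root_domain, "artifacts.example.org");
    assert_eq!(data.max_age_h, Some(12));
}
```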
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..a00c8ba
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,88 @@
+use std::borrow::Cow;
+
+use axum::{
+    body::Body,
+    response::{IntoResponse, Response},
+};
+use http::StatusCode;
+
+use crate::{templates, util::InsertTypedHeader};
+
+pub type Result<T> = core::result::Result<T, Error>;
+
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+    #[error("http client error: {0}")]
+    HttpClient(Cow<'static, str>, StatusCode),
+    #[error("http server error: {0}")]
+    Http(#[from] http::Error),
+    #[error("io: {0}")]
+    Io(#[from] std::io::Error),
+    #[error("zip: {0}")]
+    Zip(#[from] async_zip::error::ZipError),
+    #[error("internal error: {0}")]
+    Internal(Cow<'static, str>),
+
+    #[error("invalid request: {0}")]
+    BadRequest(Cow<'static, str>),
+    #[error("expected URL format: <host>--<user>--<repo>--<run>-<artifact>")]
+    InvalidUrl,
+    #[error("{0} not found")]
+    NotFound(Cow<'static, str>),
+    #[error("this path cannot be accessed for security reasons")]
+    Inaccessible,
+    #[error("this artifact has already expired")]
+    Expired,
+    #[error("timeout")]
+    Timeout(#[from] tokio::time::error::Elapsed),
+    #[error("method not allowed")]
+    MethodNotAllowed,
+}
+
+impl From<reqwest::Error> for Error {
+    fn from(value: reqwest::Error) -> Self {
+        Self::HttpClient(
+            value.to_string().into(),
+            value.status().unwrap_or(StatusCode::INTERNAL_SERVER_ERROR),
+        )
+    }
+}
+
+impl From<std::num::TryFromIntError> for Error {
+    fn from(value: std::num::TryFromIntError) -> Self {
+        Self::Internal(value.to_string().into())
+    }
+}
+
+impl From<url::ParseError> for Error {
+    fn from(value: url::ParseError) -> Self {
+        Self::Internal(value.to_string().into())
+    }
+}
+
+impl Error {
+    pub fn status(&self) -> StatusCode {
+        match self {
+            Error::BadRequest(_) | Error::InvalidUrl => StatusCode::BAD_REQUEST,
+            Error::NotFound(_) | Error::Inaccessible | Error::Expired => StatusCode::NOT_FOUND,
+            Error::HttpClient(_, status) => *status,
+            Error::MethodNotAllowed => StatusCode::METHOD_NOT_ALLOWED,
+            _ => StatusCode::INTERNAL_SERVER_ERROR,
+        }
+    }
+}
+
+impl IntoResponse for Error {
+    fn into_response(self) -> axum::response::Response {
+        let status = self.status();
+        let tmpl = templates::Error {
+            msg: &self.to_string(),
+            status: status.as_u16(),
+        };
+        Response::builder()
+            .status(self.status())
+            .typed_header(headers::ContentType::html())
+            .body(Body::from(tmpl.to_string()))
+            .unwrap()
+    }
+}
diff --git a/src/gzip_reader.rs b/src/gzip_reader.rs
new file mode 100644
index 0000000..9bbc6fb
--- /dev/null
+++ b/src/gzip_reader.rs
@@ -0,0 +1,65 @@
+use std::task::Poll;
+
+use pin_project::pin_project;
+use tokio::{
+    fs::File,
+    io::{AsyncRead, AsyncReadExt, BufReader, Take},
+};
+use tokio_util::bytes::BufMut;
+
+use crate::{cache::FileEntry, error::Result, util};
+
+#[pin_project]
+pub struct PrecompressedGzipReader {
+    #[pin]
+    reader: Take<BufReader<File>>,
+    crc: u32,
+    uncompressed_size: u32,
+    state: State,
+}
+
+pub const GZIP_EXTRA_LEN: u64 = 18;
+
+enum State {
+    Header,
+    Body,
+    Finished,
+}
+
+impl PrecompressedGzipReader {
+    pub async fn new(file: File, entry: &FileEntry) -> Result<Self> {
+        let mut reader = BufReader::new(file);
+        util::seek_to_data_offset(&mut reader, entry.header_offset.into()).await?;
+        Ok(Self {
+            reader: reader.take(entry.compressed_size.into()),
+            crc: entry.crc32,
+            uncompressed_size: entry.uncompressed_size,
+            state: State::Header,
+        })
+    }
+}
+
+impl AsyncRead for PrecompressedGzipReader {
+    fn poll_read(
+        mut self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut tokio::io::ReadBuf<'_>,
+    ) -> Poll<std::io::Result<()>> {
+        match self.state {
+            State::Header => {
+                buf.put_slice(&[0x1f, 0x8b, 0x08, 0, 0, 0, 0, 0, 0, 0xff]);
+                self.state = State::Body;
+            }
+            State::Body => {
+                std::task::ready!(self.as_mut().project().reader.poll_read(cx, buf))?;
+                if self.reader.limit() == 0 {
+                    buf.put_u32_le(self.crc);
+                    buf.put_u32_le(self.uncompressed_size);
+                    self.state = State::Finished;
+                }
+            }
+            State::Finished => {}
+        }
+        Poll::Ready(Ok(()))
+    }
+}
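The reader above works because a gzip member is nothing more than the fixed 10-byte header emitted in `State::Header`, the raw DEFLATE bytes exactly as stored in the zip entry, and the 8-byte CRC32/ISIZE trailer appended once the body is exhausted; that is where `GZIP_EXTRA_LEN = 18` comes from. A standalone sketch of that framing (flate2 and crc32fast already appear in this workspace's dependency tree):

```rust
use std::io::{Read, Write};

use flate2::{read::GzDecoder, write::DeflateEncoder, Compression};

fn main() -> std::io::Result<()> {
    let data = b"hello artifactview";

    // Raw DEFLATE stream, which is how a zip entry with method 8 stores data.
    let mut enc = DeflateEncoder::new(Vec::new(), Compression::default());
    enc.write_all(data)?;
    let deflate = enc.finish()?;

    // gzip member = 10-byte header + raw DEFLATE data + CRC32 + ISIZE (LE),
    // i.e. 18 extra bytes on top of the compressed payload.
    let mut gz = vec![0x1f, 0x8b, 0x08, 0, 0, 0, 0, 0, 0, 0xff];
    gz.extend_from_slice(&deflate);
    gz.extend_from_slice(&crc32fast::hash(data).to_le_bytes());
    gz.extend_from_slice(&(data.len() as u32).to_le_bytes());

    let mut roundtrip = Vec::new();
    GzDecoder::new(&gz[..]).read_to_end(&mut roundtrip)?;
    assert_eq!(roundtrip, data);
    Ok(())
}
```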
diff --git a/src/lib.rs b/src/lib.rs
index 5c5a809..f6cdddd 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,13 @@
+#![allow(dead_code)]
+
+mod app;
 mod artifact_api;
+mod cache;
 mod config;
+mod error;
+mod gzip_reader;
 mod query;
-mod storage;
+mod templates;
+mod util;
+
+pub struct App;
diff --git a/src/main.rs b/src/main.rs
index e7a11a9..6309057 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,13 @@
-fn main() {
-    println!("Hello, world!");
+use artifactview::App;
+
+#[tokio::main]
+async fn main() {
+    _ = dotenvy::dotenv();
+    if std::env::var_os("RUST_LOG").is_none() {
+        std::env::set_var("RUST_LOG", "info");
+    }
+    tracing_subscriber::fmt::init();
+
+    let app = App::new();
+    app.run().await.unwrap()
 }
diff --git a/src/query.rs b/src/query.rs
index 933df96..c3de5ce 100644
--- a/src/query.rs
+++ b/src/query.rs
@@ -1,8 +1,13 @@
-use anyhow::{anyhow, Result};
+use std::{fmt::Write, hash::Hash};
+
 use once_cell::sync::Lazy;
 use regex::{Captures, Regex};
+use siphasher::sip128::{Hasher128, SipHasher};
+use url::Url;
 
-#[derive(Debug, PartialEq, Eq)]
+use crate::error::{Error, Result};
+
+#[derive(Debug, PartialEq, Eq, Hash)]
 pub struct Query {
     /// Forge host
     pub host: String,
@@ -13,39 +18,114 @@
     /// CI run id
     pub run: u64,
     /// Artifact id (unique for every run)
-    pub artifact: u64,
+    pub artifact: Option<u64>,
 }
 
+static RE_REPO_NAME: Lazy<Regex> = Lazy::new(|| Regex::new("^[A-Za-z0-9\\-_\\.]+$").unwrap());
+
 impl Query {
-    pub fn from_domain(domain: &str) -> Result<Self> {
-        let (subdomain, _) = domain
-            .split_once('.')
-            .ok_or_else(|| anyhow!("no subdomain"))?;
-
-        let emsg = "expected URL format: <host>--<user>--<repo>--<run>-<artifact>";
-
+    pub fn from_subdomain(subdomain: &str) -> Result<Self> {
         let segments = subdomain.split("--").collect::<Vec<_>>();
         if segments.len() != 4 {
-            return Err(anyhow!(emsg));
+            return Err(Error::InvalidUrl);
         }
 
-        let (run, artifact) = segments[3]
-            .split_once('-')
-            .ok_or(anyhow!(emsg))?;
+        let run_and_artifact = segments[3].split('-').collect::<Vec<_>>();
+        if run_and_artifact.is_empty() || run_and_artifact.len() > 2 {
+            return Err(Error::InvalidUrl);
+        }
 
         Ok(Self {
-            host: Self::decode_domain(&segments[0], '.'),
-            user: Self::decode_domain(&segments[1], '-'),
-            repo: Self::decode_domain(&segments[2], '-'),
-            run: run.parse().ok().ok_or(anyhow!(emsg))?,
-            artifact: artifact.parse().ok().ok_or(anyhow!(emsg))?,
+            host: Self::decode_domain(segments[0], '.'),
+            user: Self::decode_domain(segments[1], '-'),
+            repo: Self::decode_domain(segments[2], '-'),
+            run: run_and_artifact[0].parse().ok().ok_or(Error::InvalidUrl)?,
+            artifact: match run_and_artifact.get(1) {
+                Some(x) => Some(x.parse().ok().ok_or(Error::InvalidUrl)?),
+                None => None,
+            },
         })
     }
 
+    pub fn from_forge_url(url: &str) -> Result<Self> {
+        let url = Url::parse(url).map_err(|_| Error::BadRequest("invalid URL".into()))?;
+
+        let host = url.domain().ok_or(Error::BadRequest("no domain".into()))?;
+        let mut path_segs = url
+            .path_segments()
+            .ok_or(Error::BadRequest("no URL path".into()))?;
+        let user = path_segs
+            .next()
+            .ok_or(Error::BadRequest("no username".into()))?;
+        let repo = path_segs
+            .next()
+            .ok_or(Error::BadRequest("no repository".into()))?;
+
+        if !path_segs.next().is_some_and(|s| s == "actions")
+            || !path_segs.next().is_some_and(|s| s == "runs")
+        {
+            return Err(Error::BadRequest("invalid Actions URL".into()));
+        }
+        if !RE_REPO_NAME.is_match(user) {
+            return Err(Error::BadRequest("invalid username".into()));
+        }
+        if !RE_REPO_NAME.is_match(repo) {
+            return Err(Error::BadRequest("invalid repository name".into()));
+        }
+
+        let run = path_segs
+            .next()
+            .and_then(|s| s.parse::<u64>().ok())
+            .ok_or(Error::BadRequest("no run ID".into()))?;
+
+        Ok(Self {
+            host: host.to_owned(),
+            user: user.to_owned(),
+            repo: repo.to_owned(),
+            run,
+            artifact: None,
+        })
+    }
+
+    pub fn subdomain(&self) -> String {
+        self.subdomain_with_artifact(self.artifact)
+    }
+
+    pub fn subdomain_with_artifact(&self, artifact: Option<u64>) -> String {
+        let mut res = format!(
+            "{}--{}--{}--{}",
+            Self::encode_domain(&self.host, '.'),
+            Self::encode_domain(&self.user, '-'),
+            Self::encode_domain(&self.repo, '-'),
+            self.run,
+        );
+        if let Some(artifact) = artifact {
+            write!(res, "-{artifact}").unwrap();
+        }
+        res
+    }
+
+    pub fn shortid(&self) -> String {
+        format!("{}/{}#{}", self.user, self.repo, self.run)
+    }
+
+    pub fn forge_url(&self) -> String {
+        format!(
+            "https://{}/{}/{}/actions/runs/{}",
+            self.host, self.user, self.repo,
self.run + ) + } + pub fn is_github(&self) -> bool { self.host == "github.com" } + pub fn siphash(&self) -> [u8; 16] { + let mut h = SipHasher::new(); + self.hash(&mut h); + h.finish128().as_bytes() + } + fn encode_domain(s: &str, bias: char) -> String { // Check if the character at the given position is in the middle of the string // and it is not followed by escape seq numbers or further escapable characters @@ -54,7 +134,7 @@ impl Query { return false; } let next_char = s[pos..].chars().nth(1).unwrap(); - !('0'..='2').contains(&next_char) && !matches!(next_char, '-'|'.'|'_') + !('0'..='2').contains(&next_char) && !matches!(next_char, '-' | '.' | '_') }; // Escape dashes @@ -106,14 +186,10 @@ impl Query { let repl2 = if bias == '-' { repl } else { - SINGLE_DASHES.replace_all(&repl, |c: &Captures| { - bias.to_string() + &c[1] - }) + SINGLE_DASHES.replace_all(&repl, |c: &Captures| bias.to_string() + &c[1]) }; - let repl3 = repl2.replace('\0', "-"); - - repl3 + repl2.replace('\0', "-") } } @@ -121,8 +197,8 @@ impl Query { mod tests { use super::Query; - use rstest::rstest; use proptest::prelude::*; + use rstest::rstest; #[rstest] #[case("_h--de.x-u", '#', "-2h-1-1de-0x-1u")] @@ -153,9 +229,9 @@ mod tests { } #[test] - fn query_from_domain() { - let d1 = "github-com--thetadev--newpipe-extractor--14-123.example.com"; - let query = Query::from_domain(d1).unwrap(); + fn query_from_subdomain() { + let d1 = "github-com--thetadev--newpipe-extractor--14-123"; + let query = Query::from_subdomain(d1).unwrap(); assert_eq!( query, Query { @@ -163,8 +239,22 @@ mod tests { user: "thetadev".to_owned(), repo: "newpipe-extractor".to_owned(), run: 14, - artifact: 123, + artifact: Some(123), } ); + assert_eq!(query.subdomain(), d1); + } + + #[test] + fn siphash() { + let q = Query { + host: "github.com".to_owned(), + user: "thetadev".to_owned(), + repo: "newpipe-extractor".to_owned(), + run: 14, + artifact: Some(123), + }; + let hash = q.siphash(); + assert_eq!(hex::encode(hash), "e523468ef42c848155a43f40895dff5a"); } } diff --git a/src/storage.rs b/src/storage.rs deleted file mode 100644 index e69de29..0000000 diff --git a/src/templates.rs b/src/templates.rs new file mode 100644 index 0000000..cde8110 --- /dev/null +++ b/src/templates.rs @@ -0,0 +1,60 @@ +use crate::{artifact_api::Artifact, cache::ListingEntry, config::Config, query::Query}; +use yarte::{Render, Template}; + +#[derive(Default)] +pub struct Version; + +impl Render for Version { + fn render(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.write_str(env!("CARGO_PKG_VERSION")) + } +} + +#[derive(Template, Default)] +#[template(path = "index")] +pub struct Index { + pub version: Version, +} + +#[derive(Template)] +#[template(path = "error")] +pub struct Error<'a> { + pub msg: &'a str, + pub status: u16, +} + +#[derive(Template)] +#[template(path = "selection")] +pub struct Selection<'a> { + pub main_url: &'a str, + pub run_url: &'a str, + pub run_name: &'a str, + pub artifacts: Vec, +} + +#[derive(Template)] +#[template(path = "listing")] +pub struct Listing<'a> { + pub main_url: &'a str, + pub version: Version, + pub artifact_name: &'a str, + pub path_components: Vec, + pub n_dirs: usize, + pub n_files: usize, + pub has_parent: bool, + pub entries: Vec, +} + +pub struct LinkItem { + pub name: String, + pub url: String, +} + +impl LinkItem { + pub fn from_artifact(artifact: Artifact, query: &Query, cfg: &Config) -> Self { + Self { + name: artifact.name, + url: cfg.url_with_subdomain(&query.subdomain_with_artifact(Some(artifact.id))), 
+ } + } +} diff --git a/src/util.rs b/src/util.rs new file mode 100644 index 0000000..685dfb2 --- /dev/null +++ b/src/util.rs @@ -0,0 +1,165 @@ +use std::io::SeekFrom; + +use async_zip::error::ZipError; +use axum::{extract::Request, http::HeaderMap}; +use headers::{Header, HeaderMapExt}; +use http::header; +use mime_guess::Mime; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt}; + +use crate::error::{Error, Result}; + +pub trait InsertTypedHeader { + /// Inserts a typed header to this response. + fn typed_header(self, header: T) -> Self; +} + +impl InsertTypedHeader for axum::http::response::Builder { + fn typed_header(mut self, header: T) -> Self { + if let Some(headers) = self.headers_mut() { + headers.typed_insert(header); + } + self + } +} + +pub fn accepts_gzip(headers: &HeaderMap) -> bool { + headers + .get(header::ACCEPT_ENCODING) + .and_then(|h| h.to_str().ok()) + .map(|h| { + h.split(',').any(|val| { + val.split(';') + .next() + .map(|v| { + let vt = v.trim(); + vt.eq_ignore_ascii_case("gzip") || vt == "*" + }) + .unwrap_or_default() + }) + }) + .unwrap_or_default() +} + +/// Seek to the contained compressed data within a zip file +pub async fn seek_to_data_offset( + reader: &mut R, + header_offset: u64, +) -> core::result::Result<(), ZipError> { + const LFH_SIGNATURE: u32 = 0x4034b50; + + // Seek to the header + reader.seek(SeekFrom::Start(header_offset)).await?; + + // Check the signature + let signature = { + let mut buffer = [0; 4]; + reader.read_exact(&mut buffer).await?; + u32::from_le_bytes(buffer) + }; + + match signature { + LFH_SIGNATURE => (), + actual => return Err(ZipError::UnexpectedHeaderError(actual, LFH_SIGNATURE)), + }; + + // Skip the local file header and trailing data + let mut header_data: [u8; 26] = [0; 26]; + reader.read_exact(&mut header_data).await?; + let file_name_length = u16::from_le_bytes(header_data[22..24].try_into().unwrap()); + let extra_field_length = u16::from_le_bytes(header_data[24..26].try_into().unwrap()); + + let trailing_size = (file_name_length as i64) + (extra_field_length as i64); + reader.seek(SeekFrom::Current(trailing_size)).await?; + + Ok(()) +} + +/// Return the file extension of a website path +pub fn site_path_ext(path: &str) -> Option<&str> { + let mut parts = path.split('.').rev(); + parts + .next() + .filter(|ext| !ext.contains('/') && parts.next().is_some()) +} + +/// Get the file extension of a website path +pub fn path_mime(path: &str) -> Option { + site_path_ext(path).and_then(|ext| mime_guess::from_ext(ext).first()) +} + +pub fn full_url_from_request(request: &Request) -> String { + let uri = request.uri(); + if let Some(host) = host_from_request(request) { + format!("{}{}", host, uri.path()) + } else { + uri.to_string() + } +} + +fn host_from_request(request: &Request) -> Option<&str> { + parse_forwarded(request.headers()) + .or_else(|| { + request + .headers() + .get("X-Forwarded-Host") + .and_then(|host| host.to_str().ok()) + }) + .or_else(|| { + request + .headers() + .get(http::header::HOST) + .and_then(|host| host.to_str().ok()) + }) +} + +fn parse_forwarded(headers: &HeaderMap) -> Option<&str> { + // if there are multiple `Forwarded` `HeaderMap::get` will return the first one + let forwarded_values = headers.get(header::FORWARDED)?.to_str().ok()?; + + // get the first set of values + let first_value = forwarded_values.split(',').next()?; + + // find the value of the `host` field + first_value.split(';').find_map(|pair| { + let (key, value) = pair.split_once('=')?; + key.trim() + 
.eq_ignore_ascii_case("host") + .then(|| value.trim().trim_matches('"')) + }) +} + +pub fn get_subdomain<'a>(host: &'a str, root_domain: &str) -> Result<&'a str> { + let stripped = host.strip_suffix(root_domain).ok_or(Error::BadRequest( + "host does not end with configured ROOT_DOMAIN".into(), + ))?; + Ok(stripped.trim_end_matches('.')) +} + +#[cfg(test)] +mod tests { + use http::{header, HeaderMap}; + use rstest::rstest; + + #[rstest] + #[case("", false)] + #[case("br", false)] + #[case("gzip", true)] + #[case("GZIP", true)] + #[case("*", true)] + #[case("deflate, gzip;q=1.0, *;q=0.5", true)] + fn accepts_gzip(#[case] val: &str, #[case] expect: bool) { + let mut hdrs = HeaderMap::new(); + hdrs.insert(header::ACCEPT_ENCODING, val.try_into().unwrap()); + + assert_eq!(super::accepts_gzip(&hdrs), expect); + } + + #[rstest] + #[case("localhost", Some(""))] + #[case("test.localhost", Some("test"))] + #[case("example.com", None)] + fn get_subdomain(#[case] host: &str, #[case] expect: Option<&str>) { + assert_eq!(super::get_subdomain(host, "localhost").ok(), expect); + } +} diff --git a/templates/error.hbs b/templates/error.hbs new file mode 100644 index 0000000..63f7d40 --- /dev/null +++ b/templates/error.hbs @@ -0,0 +1,40 @@ + + + + + + + Artifactview + + +

+ +

Error {{status}}

+

{{msg}}

+
+ + diff --git a/templates/index.hbs b/templates/index.hbs new file mode 100644 index 0000000..8cbf827 --- /dev/null +++ b/templates/index.hbs @@ -0,0 +1,98 @@ + + + + + + + Artifactview + + +
+ +

Enter a GitHub/Gitea/Forgejo Actions run URL to browse CI artifacts

+
+ + +
+ +
+ + diff --git a/templates/listing.hbs b/templates/listing.hbs new file mode 100644 index 0000000..85b9f10 --- /dev/null +++ b/templates/listing.hbs @@ -0,0 +1,82 @@ + + + + + + + +Index of {{artifact_name}} + + + + + + +
+ + + +

+ {{#each path_components}}{{this.name}} /{{/each}} +

+
+ +
+
+
+{{n_dirs}} directories +{{n_files}} files + +
+
+
+ + + + + + + + +{{#if has_parent}} + + + + + + +{{/if}} +{{#each entries}} + + + + + +{{/each}} + +
Name  ↓ Size  ↓ CRC32
Parent directory
+ + + {{this.name}} + + {{#if this.is_dir}}—{{else}}{{this.size}}{{/if}}{{#if this.is_dir}}—{{else}}{{this.crc32}}{{/if}}
+
+
+ + + + + + diff --git a/templates/selection.hbs b/templates/selection.hbs new file mode 100644 index 0000000..be1b878 --- /dev/null +++ b/templates/selection.hbs @@ -0,0 +1,49 @@ + + + + + + + Artifactview + + +
+ + + +

CI artifacts for {{run_name}}:

+ {{#each artifacts}} + {{this.name}} + {{/each}} +
+ +
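The selection page above links each artifact to its own subdomain. A simplified sketch of how such a subdomain is assembled from a run's coordinates; the real encoder in src/query.rs additionally escapes literal `-`, `.` and `_` characters, which this sketch ignores:

```rust
fn main() {
    let (host, user, repo, run, artifact) =
        ("github.com", "thetadev", "newpipe-extractor", 14u64, 123u64);

    // '.' is the "bias" character for the host segment and is written as '-';
    // the four segments are then joined with '--', and the artifact id is
    // appended to the run id with a single '-'.
    let encoded_host = host.replace('.', "-");
    let subdomain = format!("{encoded_host}--{user}--{repo}--{run}-{artifact}");

    // Matches the round-trip asserted in the query.rs tests:
    // Query::from_subdomain recovers host/user/repo/run/artifact from this.
    assert_eq!(subdomain, "github-com--thetadev--newpipe-extractor--14-123");
    println!("https://{subdomain}.localhost:3000/");
}
```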