Overview
The previous project focused on deterministic text analysis; now we package those artifacts, together with the surrounding diagnostics, into a reproducible archival pipeline. We will write a minimalist ZIP creator that streams files into memory, emits a central directory, and then verifies extraction while reporting incremental progress. The program relies on the standard library's ZIP reader, manual header encoding, StringHashMap bookkeeping for CRC32 checks, and structured status updates via std.Progress (relevant std sources: zip.zig, hash_map.zig, crc.zig, Progress.zig).
Learning Goals
- Assemble a ZIP archive from scratch by writing local file headers, the central directory, and the end-of-central-directory record in the correct order, while respecting size and offset constraints.
- Capture deterministic integrity metrics (CRC32, SHA-256; see crypto.zig) alongside the package so that continuous integration can verify structure and contents on every run.
- Display analyzer-friendly progress messages by disabling the animated renderer and emitting plain-text checkpoints via std.Progress (see the sketch after this list).
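A minimal, self-contained sketch of that pattern, separate from the pipeline below (the node names and counts are illustrative):

const std = @import("std");

pub fn main() void {
    // disable_printing turns off the animated terminal renderer; the node
    // tree still tracks counts, so the program can emit its own plain lines.
    var root = std.Progress.start(.{
        .root_name = "demo",
        .estimated_total_items = 1,
        .disable_printing = true,
    });
    defer root.end();

    var step = root.start("work", 4);
    for (0..4) |_| step.completeOne();
    step.end();

    std.debug.print("work: 4/4 done\n", .{});
}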
Designing the Pipeline
The workflow splits into three stages: seeding sample files, building the archive, and extracting plus verifying. Each stage increments the root progress node and produces deterministic console summaries that double as acceptance criteria. All filesystem operations happen under a temporary directory managed by std.testing.tmpDir (see testing.zig), keeping the real workspace clean.
For archive metadata, we reuse the same relative paths when writing headers and later when verifying the extracted files. Storing each path's CRC32 and byte count in a StringHashMap lets us diff expected output against actual output in a straightforward way after extraction.
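A condensed sketch of that bookkeeping, using the same owned-key convention as the full program below (the path and metric values here are illustrative):

const std = @import("std");

const EntryMetrics = struct { crc32: u32, size: usize };

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    var metrics = std.StringHashMap(EntryMetrics).init(allocator);
    defer {
        var it = metrics.iterator();
        while (it.next()) |kv| allocator.free(kv.key_ptr.*);
        metrics.deinit();
    }

    // Record phase: dupe the key once so the map owns stable memory.
    const path = "input/metrics.txt";
    const gop = try metrics.getOrPut(path);
    if (!gop.found_existing) gop.key_ptr.* = try allocator.dupe(u8, path);
    gop.value_ptr.* = .{ .crc32 = 0xdeadbeef, .size = 36 };

    // Verify phase: look up by the same relative path and compare.
    const expected = metrics.get(path) orelse return error.Missing;
    std.debug.print("crc=0x{x}, size={d}\n", .{ expected.crc32, expected.size });
}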
Archive Assembly
Because Zig 0.15.2 ships a ZIP reader but no writer, we build the archive in memory with an ArrayList(u8), appending each component in turn: local file header, file name, file bytes. Every header field is written with explicit little-endian helpers so the result stays portable across architectures. Once the payloads are in the blob, we append the central directory (one record per file) followed by the end-of-central-directory record, mirroring the structures defined in the PKWARE APPNOTE and encoded in std.zip (relevant std sources: array_list.zig, fmt.zig).
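For reference, the fixed-size layouts the writer emits (the classic, non-Zip64 subset per the PKWARE APPNOTE; all multi-byte fields are little-endian):

- Local file header (30 bytes + name): signature "PK\x03\x04" (4), version needed = 20 (2), flags (2), method = 0 for stored (2), mod time (2), mod date (2), CRC-32 (4), compressed size (4), uncompressed size (4), name length (2), extra length (2).
- Central directory header (46 bytes + name): signature "PK\x01\x02" (4), version made by (2), version needed (2), flags (2), method (2), mod time (2), mod date (2), CRC-32 (4), compressed size (4), uncompressed size (4), name length (2), extra length (2), comment length (2), disk number (2), internal attributes (2), external attributes (4), local header offset (4).
- End of central directory (22 bytes): signature "PK\x05\x06" (4), disk number (2), central directory start disk (2), entries on this disk (2), total entries (2), directory size (4), directory offset (4), comment length (2).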
While writing the headers, we ensure that sizes and offsets fit 32-bit fields (sticking to the classic ZIP subset) and copy each file name into the map exactly once so resources can be freed deterministically later. Once the archive blob is complete, we persist it to disk and compute a SHA-256 digest for downstream regression checks; the digest is rendered with std.fmt.bytesToHex, so it can be compared inline without any extra tooling.
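As a standalone check of that rendering step, a minimal sketch (hashing the empty input yields the well-known SHA-256 constant):

const std = @import("std");

pub fn main() void {
    var digest: [32]u8 = undefined;
    // Empty input produces the canonical digest
    // e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855.
    std.crypto.hash.sha2.Sha256.hash("", &digest, .{});
    const hex = std.fmt.bytesToHex(digest, .lower);
    std.debug.print("sha256={s}\n", .{hex[0..]});
}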
Extraction and Verification
Extraction reuses the standard library iterator, which walks each central directory record and hands the data stream to std.zip.Entry.extract; we normalize the root folder name via std.zip.Diagnostics so it can be surfaced to callers. After each file lands on disk, we compute its CRC32 again and compare the byte count against the recorded expectation. Any mismatch fails the program immediately, making it safe to embed in CI pipelines or deployment hooks.
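The CRC routine is easy to exercise in isolation. A quick sketch, assuming the one-shot hash helper that recent standard libraries expose on the Crc type (the standard CRC-32 check value for "123456789" is 0xcbf43926):

const std = @import("std");

pub fn main() void {
    // One-shot form of the init/update/final sequence used in the pipeline.
    const check = std.hash.crc.Crc32.hash("123456789");
    std.debug.print("crc32=0x{x:0>8}\n", .{check}); // expect 0xcbf43926
}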
std.Progress nodes drive the console output: the root node tracks the three high-level stages, while child nodes count files during seeding, building, and verification. Because printing is disabled, the final messages are plain-text lines (rendered through a buffered stdout writer) that can be diffed verbatim in automated tests.
End-to-End Implementation
const std = @import("std");
const SampleFile = struct {
path: []const u8,
contents: []const u8,
};
const sample_files = [_]SampleFile{
.{ .path = "input/metrics.txt", .contents = "uptime=420s\nrequests=1312\nerrors=3\n" },
.{ .path = "input/inventory.json", .contents = "{\n \"service\": \"telemetry\",\n \"shards\": [\"alpha\", \"beta\", \"gamma\"]\n}\n" },
.{ .path = "input/logs/app.log", .contents = "[info] ingest started\n[warn] queue delay=87ms\n[info] ingest completed\n" },
.{ .path = "input/README.md", .contents = "# Telemetry bundle\n\nSynthetic records used for the zip/unzip progress demo.\n" },
};
const EntryMetrics = struct {
crc32: u32,
size: usize,
};
const BuildSummary = struct {
bytes_written: usize,
sha256: [32]u8,
};
const VerifySummary = struct {
files_checked: usize,
total_bytes: usize,
extracted_root: []const u8,
owns_root: bool,
};
const archive_path = "artifact/telemetry.zip";
const extract_root = "replay";
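// Stage 1: materialize the synthetic inputs on disk, creating parent
// directories as needed and ticking the progress node once per file.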
fn seedSamples(dir: std.fs.Dir, progress: *std.Progress.Node) !struct { files: usize, bytes: usize } {
var total_bytes: usize = 0;
for (sample_files) |sample| {
if (std.fs.path.dirname(sample.path)) |parent| {
try dir.makePath(parent);
}
var file = try dir.createFile(sample.path, .{ .truncate = true });
defer file.close();
try file.writeAll(sample.contents);
total_bytes += sample.contents.len;
progress.completeOne();
}
return .{ .files = sample_files.len, .bytes = total_bytes };
}
const EntryRecord = struct {
name: []const u8,
crc32: u32,
size: u32,
offset: u32,
};
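// 30-byte local file header ("PK\x03\x04"): method 0 = stored (no
// compression), timestamps zeroed so the archive bytes stay deterministic.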
fn makeLocalHeader(name_len: u16, crc32: u32, size: u32) [30]u8 {
var header: [30]u8 = undefined;
header[0] = 'P';
header[1] = 'K';
header[2] = 3;
header[3] = 4;
std.mem.writeInt(u16, header[4..6], 20, .little);
std.mem.writeInt(u16, header[6..8], 0, .little);
std.mem.writeInt(u16, header[8..10], 0, .little);
std.mem.writeInt(u16, header[10..12], 0, .little);
std.mem.writeInt(u16, header[12..14], 0, .little);
std.mem.writeInt(u32, header[14..18], crc32, .little);
std.mem.writeInt(u32, header[18..22], size, .little);
std.mem.writeInt(u32, header[22..26], size, .little);
std.mem.writeInt(u16, header[26..28], name_len, .little);
std.mem.writeInt(u16, header[28..30], 0, .little);
return header;
}
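// 46-byte central directory header ("PK\x01\x02"): version-made-by 0x0314 =
// Unix host (3) + spec 2.0 (20); external attributes carry the Unix file
// mode 0o100644 in the upper 16 bits.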
fn makeCentralHeader(entry: EntryRecord) [46]u8 {
var header: [46]u8 = undefined;
header[0] = 'P';
header[1] = 'K';
header[2] = 1;
header[3] = 2;
std.mem.writeInt(u16, header[4..6], 0x0314, .little);
std.mem.writeInt(u16, header[6..8], 20, .little);
std.mem.writeInt(u16, header[8..10], 0, .little);
std.mem.writeInt(u16, header[10..12], 0, .little);
std.mem.writeInt(u16, header[12..14], 0, .little);
std.mem.writeInt(u16, header[14..16], 0, .little);
std.mem.writeInt(u32, header[16..20], entry.crc32, .little);
std.mem.writeInt(u32, header[20..24], entry.size, .little);
std.mem.writeInt(u32, header[24..28], entry.size, .little);
const name_len_u16 = @as(u16, @intCast(entry.name.len));
std.mem.writeInt(u16, header[28..30], name_len_u16, .little);
std.mem.writeInt(u16, header[30..32], 0, .little);
std.mem.writeInt(u16, header[32..34], 0, .little);
std.mem.writeInt(u16, header[34..36], 0, .little);
std.mem.writeInt(u16, header[36..38], 0, .little);
const unix_mode: u32 = 0o100644 << 16;
std.mem.writeInt(u32, header[38..42], unix_mode, .little);
std.mem.writeInt(u32, header[42..46], entry.offset, .little);
return header;
}
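// 22-byte end-of-central-directory record ("PK\x05\x06") holding entry
// counts plus the size and start offset of the central directory.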
fn makeEndRecord(cd_size: u32, cd_offset: u32, entry_count: u16) [22]u8 {
var footer: [22]u8 = undefined;
footer[0] = 'P';
footer[1] = 'K';
footer[2] = 5;
footer[3] = 6;
std.mem.writeInt(u16, footer[4..6], 0, .little);
std.mem.writeInt(u16, footer[6..8], 0, .little);
std.mem.writeInt(u16, footer[8..10], entry_count, .little);
std.mem.writeInt(u16, footer[10..12], entry_count, .little);
std.mem.writeInt(u32, footer[12..16], cd_size, .little);
std.mem.writeInt(u32, footer[16..20], cd_offset, .little);
std.mem.writeInt(u16, footer[20..22], 0, .little);
return footer;
}
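// Stage 2: assemble the archive in an in-memory blob, record per-entry
// CRC32/size metrics, persist the bytes, and hash the final artifact.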
fn buildArchive(
allocator: std.mem.Allocator,
dir: std.fs.Dir,
metrics: *std.StringHashMap(EntryMetrics),
progress: *std.Progress.Node,
) !BuildSummary {
if (std.fs.path.dirname(archive_path)) |parent| {
try dir.makePath(parent);
}
var entries = try std.ArrayList(EntryRecord).initCapacity(allocator, sample_files.len);
defer entries.deinit(allocator);
try metrics.ensureTotalCapacity(sample_files.len);
var blob: std.ArrayList(u8) = .empty;
defer blob.deinit(allocator);
for (sample_files) |sample| {
if (sample.path.len > std.math.maxInt(u16)) return error.NameTooLong;
var file = try dir.openFile(sample.path, .{});
defer file.close();
const max_len = 64 * 1024;
const data = try file.readToEndAlloc(allocator, max_len);
defer allocator.free(data);
if (data.len > std.math.maxInt(u32)) return error.InputTooLarge;
if (blob.items.len > std.math.maxInt(u32)) return error.ArchiveTooLarge;
var crc = std.hash.crc.Crc32.init();
crc.update(data);
const digest = crc.final();
const offset_u32 = @as(u32, @intCast(blob.items.len));
const size_u32 = @as(u32, @intCast(data.len));
const name_len_u16 = @as(u16, @intCast(sample.path.len));
const header = makeLocalHeader(name_len_u16, digest, size_u32);
try blob.appendSlice(allocator, header[0..]);
try blob.appendSlice(allocator, sample.path);
try blob.appendSlice(allocator, data);
try entries.append(allocator, .{
.name = sample.path,
.crc32 = digest,
.size = size_u32,
.offset = offset_u32,
});
const gop = try metrics.getOrPut(sample.path);
if (!gop.found_existing) {
gop.key_ptr.* = try allocator.dupe(u8, sample.path);
}
gop.value_ptr.* = .{ .crc32 = digest, .size = data.len };
progress.completeOne();
}
const central_offset_usize = blob.items.len;
if (central_offset_usize > std.math.maxInt(u32)) return error.ArchiveTooLarge;
const central_offset = @as(u32, @intCast(central_offset_usize));
for (entries.items) |entry| {
const header = makeCentralHeader(entry);
try blob.appendSlice(allocator, header[0..]);
try blob.appendSlice(allocator, entry.name);
}
const central_size = @as(u32, @intCast(blob.items.len - central_offset_usize));
const footer = makeEndRecord(central_size, central_offset, @as(u16, @intCast(entries.items.len)));
try blob.appendSlice(allocator, footer[0..]);
var zip_file = try dir.createFile(archive_path, .{ .truncate = true, .read = true });
defer zip_file.close();
try zip_file.writeAll(blob.items);
var sha256 = std.crypto.hash.sha2.Sha256.init(.{});
sha256.update(blob.items);
var digest_bytes: [32]u8 = undefined;
sha256.final(&digest_bytes);
return .{ .bytes_written = blob.items.len, .sha256 = digest_bytes };
}
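// Stage 3: extract via std.zip, then re-hash every file and compare it
// against the metrics recorded during the build.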
fn extractAndVerify(
allocator: std.mem.Allocator,
dir: std.fs.Dir,
metrics: *const std.StringHashMap(EntryMetrics),
progress: *std.Progress.Node,
) !VerifySummary {
try dir.makePath(extract_root);
var dest_dir = try dir.openDir(extract_root, .{ .access_sub_paths = true, .iterate = true });
defer dest_dir.close();
var file = try dir.openFile(archive_path, .{});
defer file.close();
var read_buf: [4096]u8 = undefined;
var reader = file.reader(&read_buf);
var diagnostics = std.zip.Diagnostics{ .allocator = allocator };
defer diagnostics.deinit();
try std.zip.extract(dest_dir, &reader, .{ .diagnostics = &diagnostics });
var files_checked: usize = 0;
var total_bytes: usize = 0;
for (sample_files) |sample| {
var out_file = try dest_dir.openFile(sample.path, .{});
defer out_file.close();
const data = try out_file.readToEndAlloc(allocator, 64 * 1024);
defer allocator.free(data);
const expected = metrics.get(sample.path) orelse return error.ExpectedEntryMissing;
var crc = std.hash.crc.Crc32.init();
crc.update(data);
if (crc.final() != expected.crc32 or data.len != expected.size) {
return error.VerificationFailed;
}
files_checked += 1;
total_bytes += data.len;
progress.completeOne();
}
var result_root: []const u8 = "<scattered>";
var owns_root = false;
if (diagnostics.root_dir.len > 0) {
result_root = try allocator.dupe(u8, diagnostics.root_dir);
owns_root = true;
}
return .{
.files_checked = files_checked,
.total_bytes = total_bytes,
.extracted_root = result_root,
.owns_root = owns_root,
};
}
pub fn main() !void {
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
defer {
const leak_status = gpa.deinit();
std.debug.assert(leak_status == .ok);
}
const allocator = gpa.allocator();
var stdout_buffer: [512]u8 = undefined;
var stdout_writer = std.fs.File.stdout().writer(&stdout_buffer);
const out = &stdout_writer.interface;
var tmp = std.testing.tmpDir(.{});
defer tmp.cleanup();
var metrics = std.StringHashMap(EntryMetrics).init(allocator);
defer {
var it = metrics.iterator();
while (it.next()) |kv| {
allocator.free(kv.key_ptr.*);
}
metrics.deinit();
}
var progress_root = std.Progress.start(.{
.root_name = "zip-pipeline",
.estimated_total_items = 3,
.disable_printing = true,
});
defer progress_root.end();
var stage_seed = progress_root.start("seed", sample_files.len);
const seeded = try seedSamples(tmp.dir, &stage_seed);
stage_seed.end();
try out.print("[1/3] seeded samples -> files={d}, bytes={d}\n", .{ seeded.files, seeded.bytes });
var stage_build = progress_root.start("build", sample_files.len);
const build_summary = try buildArchive(allocator, tmp.dir, &metrics, &stage_build);
stage_build.end();
const hex_digest = std.fmt.bytesToHex(build_summary.sha256, .lower);
try out.print("[2/3] built archive -> bytes={d}\n sha256={s}\n", .{ build_summary.bytes_written, hex_digest[0..] });
var stage_verify = progress_root.start("verify", sample_files.len);
const verify_summary = try extractAndVerify(allocator, tmp.dir, &metrics, &stage_verify);
stage_verify.end();
defer if (verify_summary.owns_root) allocator.free(verify_summary.extracted_root);
try out.print(
"[3/3] extracted + verified -> files={d}, bytes={d}, root={s}\n",
.{ verify_summary.files_checked, verify_summary.total_bytes, verify_summary.extracted_root },
);
try out.flush();
}
$ zig run zip_progress_pipeline.zig
[1/3] seeded samples -> files=4, bytes=250
[2/3] built archive -> bytes=716
sha256=4a13a3dc1e6ef90c252b0cc797ff14456aa28c670cafbc9d27a025b0079b05d5
[3/3] extracted + verified -> files=4, bytes=250, root=input

The verification step deliberately duplicates the extracted root string when the diagnostics discover a common prefix; the summary then frees that buffer so the general-purpose allocator stays clean. This mirrors good hygiene for CLI tools that stream large archives through temporary directories.