Spaces:
Running
Running
| // Binary parsers for xorb and shard files | |
| import type { | |
| ParsedFileMetadata, | |
| Chunk, | |
| ChunkHeader, | |
| ShardData, | |
| MerkleHash, | |
| MDBShardFileHeader, | |
| MDBShardFileFooter, | |
| FileDataSequenceHeader, | |
| FileDataSequenceEntry, | |
| FileVerificationEntry, | |
| FileMetadataExt, | |
| CASChunkSequenceHeader, | |
| CASChunkSequenceEntry, | |
| MDBFileInfo, | |
| MDBCASInfo, | |
| } from "./types.js"; | |
| import { MDB_SHARD_HEADER_TAG, XORB_IDENT } from "./types.js"; | |
| export class BinaryReader { | |
| private data: Uint8Array; | |
| private offset: number = 0; | |
| constructor(data: Uint8Array) { | |
| this.data = data; | |
| } | |
| readUint8(): number { | |
| if (this.offset >= this.data.length) { | |
| console.trace(); | |
| throw new Error("Unexpected end of data"); | |
| } | |
| return this.data[this.offset++]; | |
| } | |
| readUint32LE(): number { | |
| if (this.offset + 4 > this.data.length) { | |
| console.trace(); | |
| throw new Error("Unexpected end of data"); | |
| } | |
| const result = new DataView(this.data.buffer).getUint32(this.offset, true); | |
| this.offset += 4; | |
| return result; | |
| } | |
| readUint64LE(): bigint { | |
| if (this.offset + 8 > this.data.length) { | |
| console.trace(); | |
| throw new Error("Unexpected end of data"); | |
| } | |
| const result = new DataView(this.data.buffer).getBigUint64( | |
| this.offset, | |
| true | |
| ); | |
| this.offset += 8; | |
| return result; | |
| } | |
| readBytes(length: number): Uint8Array { | |
| if (this.offset + length > this.data.length) { | |
| console.trace(); | |
| throw new Error("Unexpected end of data"); | |
| } | |
| const result = this.data.slice(this.offset, this.offset + length); | |
| this.offset += length; | |
| return result; | |
| } | |
| readHash(): MerkleHash { | |
| const data = this.readBytes(32); | |
| // const u64_0 = this.readUint64LE(); | |
| // const u64_1 = this.readUint64LE(); | |
| // const u64_2 = this.readUint64LE(); | |
| // const u64_3 = this.readUint64LE(); | |
| return { data }; | |
| // return { data: [u64_0, u64_1, u64_2, u64_3] }; | |
| } | |
| readString(length: number): string { | |
| const bytes = this.readBytes(length); | |
| return new TextDecoder().decode(bytes); | |
| } | |
| seek(position: number): void { | |
| this.offset = position; | |
| } | |
| seekFromEnd(offsetFromEnd: number): void { | |
| this.offset = this.data.length - offsetFromEnd; | |
| } | |
| get position(): number { | |
| return this.offset; | |
| } | |
| get remaining(): number { | |
| return this.data.length - this.offset; | |
| } | |
| } | |
| function arraysEqual(a: Uint8Array, b: Uint8Array): boolean { | |
| if (a.length !== b.length) return false; | |
| for (let i = 0; i < a.length; i++) { | |
| if (a[i] !== b[i]) return false; | |
| } | |
| return true; | |
| } | |
| function isBookendHash(hash: MerkleHash): boolean { | |
| // Bookend hash is all 0xFF bytes (all 64-bit values should be 0xFFFFFFFFFFFFFFFF) | |
| return hash.data.every((value) => value === 0xff); | |
| // return hash.data.every((value) => value === 0xffffffffffffffffn); | |
| } | |
| export function formatHash(hash: MerkleHash): string { | |
| // For every 8 bytes in hash.data, flip the order of the 8 bytes, | |
| // concatenate each new flipped slice, then convert the result array to a hexadecimal string. | |
| if (!hash || !hash.data || hash.data.length !== 32) return ""; | |
| const flipped = new Uint8Array(32); | |
| for (let i = 0; i < 4; i++) { | |
| const start = i * 8; | |
| for (let j = 0; j < 8; j++) { | |
| flipped[start + j] = hash.data[start + (7 - j)]; | |
| } | |
| } | |
| // Convert to hex string | |
| return Array.from(flipped) | |
| .map((b) => b.toString(16).padStart(2, "0")) | |
| .join(""); | |
| return ""; | |
| } | |
| // File type detection removed - type is now specified by user selection | |
| function parseXorbFile(data: Uint8Array): Chunk[] { | |
| const reader = new BinaryReader(data); | |
| const chunks: Chunk[] = []; | |
| while (reader.remaining > 0) { | |
| // Check if we have enough bytes for a header | |
| if (reader.remaining < 8) { | |
| console.error("Unexpected end of data parsing xorb file"); | |
| break; | |
| } | |
| const header_bytes = reader.readBytes(8); | |
| let is_xorb_ident = true; | |
| // Urgh how do I compare two Uint8Arrays? | |
| for (let i = 0; i < 7; i++) { | |
| if (header_bytes[i] !== XORB_IDENT[i]) { | |
| is_xorb_ident = false; | |
| break; | |
| } | |
| } | |
| if (is_xorb_ident) { | |
| // reached optional xorb footer, skip rest | |
| break; | |
| } | |
| const header = new DataView(header_bytes.buffer); | |
| const version = header.getUint8(0); | |
| const compressed_size = | |
| header.getUint8(1) | | |
| (header.getUint8(2) << 8) | | |
| (header.getUint8(3) << 16); | |
| const compression_type = header.getUint8(4); | |
| const uncompressed_size = | |
| header.getUint8(5) | | |
| (header.getUint8(6) << 8) | | |
| (header.getUint8(7) << 16); | |
| const chunkHeader: ChunkHeader = { | |
| version, | |
| compressed_size, | |
| compression_type, | |
| uncompressed_size, | |
| }; | |
| const compressed_data = reader.readBytes(compressed_size); | |
| chunks.push({ header: chunkHeader, compressed_data }); | |
| } | |
| return chunks; | |
| } | |
// Parse an MDB shard file: a fixed 48-byte header (32-byte tag + two u64s),
// a footer located `footer_size` bytes from the end of the file, then the
// file-info and CAS-info sections at the offsets recorded in the footer.
function parseShardFile(data: Uint8Array): ShardData {
  const reader = new BinaryReader(data);
  // Parse header: 32-byte magic tag, then version and footer_size as u64 LE.
  const tag = reader.readBytes(32);
  if (!arraysEqual(tag, MDB_SHARD_HEADER_TAG)) {
    throw new Error("Invalid shard file header tag");
  }
  const header: MDBShardFileHeader = {
    tag,
    version: Number(reader.readUint64LE()),
    footer_size: Number(reader.readUint64LE()),
  };
  if (header.version !== 2) {
    throw new Error(`Unsupported shard header version: ${header.version}`);
  }
  // Parse footer (from end of file) — the footer is the last
  // `header.footer_size` bytes.
  reader.seekFromEnd(header.footer_size);
  const version = Number(reader.readUint64LE());
  const file_info_offset = Number(reader.readUint64LE());
  const cas_info_offset = Number(reader.readUint64LE());
  // Skip first buffer (48 bytes) — presumably reserved/unused footer fields;
  // TODO confirm against the shard format definition.
  reader.readBytes(48);
  const chunk_hash_hmac_key = reader.readHash();
  const shard_creation_timestamp = Number(reader.readUint64LE());
  const shard_key_expiry = Number(reader.readUint64LE());
  // Skip second buffer (72 bytes) — same caveat as above.
  reader.readBytes(72);
  const footer_offset = Number(reader.readUint64LE());
  const footer: MDBShardFileFooter = {
    version,
    file_info_offset,
    cas_info_offset,
    chunk_hash_hmac_key,
    shard_creation_timestamp,
    shard_key_expiry,
    footer_offset,
  };
  if (footer.version !== 1) {
    throw new Error(`Unsupported shard footer version: ${footer.version}`);
  }
  // Parse file info section: a sequence of FileDataSequenceHeader records,
  // each followed by its entries and optional per-flag trailers, terminated
  // by a bookend hash (or by reaching cas_info_offset).
  const file_info: MDBFileInfo[] = [];
  reader.seek(footer.file_info_offset);
  while (reader.position < footer.cas_info_offset) {
    const pos = reader.position;
    const file_hash = reader.readHash();
    // Check for bookend
    if (isBookendHash(file_hash)) {
      reader.readBytes(16); // unused
      break;
    }
    const file_flags = reader.readUint32LE();
    const num_entries = reader.readUint32LE();
    const _unused = reader.readBytes(8);
    const header: FileDataSequenceHeader = {
      file_hash,
      file_flags,
      num_entries,
      _unused,
    };
    // Read entries — one 48-byte FileDataSequenceEntry per chunk range.
    const entries: FileDataSequenceEntry[] = [];
    for (let i = 0; i < num_entries; i++) {
      const cas_hash = reader.readHash();
      const cas_flags = reader.readUint32LE();
      const unpacked_segment_bytes = reader.readUint32LE();
      const chunk_index_start = reader.readUint32LE();
      const chunk_index_end = reader.readUint32LE();
      entries.push({
        cas_hash,
        cas_flags,
        unpacked_segment_bytes,
        chunk_index_start,
        chunk_index_end,
      });
    }
    // Read verification entries if present (flag bit 0x80000000):
    // one 48-byte entry per data entry.
    let verification_entries: FileVerificationEntry[] | undefined;
    if (file_flags & 0x80000000) {
      verification_entries = [];
      for (let i = 0; i < num_entries; i++) {
        verification_entries.push({
          chunk_hash: reader.readHash(),
          _unused: reader.readBytes(16),
        });
      }
    }
    // Read metadata extension if present (flag bit 0x40000000): sha256 + pad.
    let metadata_ext: FileMetadataExt | undefined;
    if (file_flags & 0x40000000) {
      metadata_ext = {
        sha256: reader.readHash(),
        _unused: reader.readBytes(16),
      };
    }
    file_info.push({
      header,
      entries,
      verification_entries,
      metadata_ext,
    });
  }
  // Parse CAS info section: CASChunkSequenceHeader records each followed by
  // their chunk entries, terminated by a bookend hash (or footer_offset).
  // NOTE(review): the file-info bookend consumes 16 trailing bytes but this
  // one does not — confirm that asymmetry is intentional.
  const cas_info: MDBCASInfo[] = [];
  reader.seek(footer.cas_info_offset);
  while (reader.position < footer.footer_offset) {
    const cas_hash = reader.readHash();
    // Check for bookend
    if (isBookendHash(cas_hash)) {
      break;
    }
    const cas_flags = reader.readUint32LE();
    const num_entries = reader.readUint32LE();
    const num_bytes_in_cas = reader.readUint32LE();
    const num_bytes_on_disk = reader.readUint32LE();
    const header: CASChunkSequenceHeader = {
      cas_hash,
      cas_flags,
      num_entries,
      num_bytes_in_cas,
      num_bytes_on_disk,
    };
    // Read entries — hash + byte range start + unpacked size + 8 unused bytes.
    const entries: CASChunkSequenceEntry[] = [];
    for (let i = 0; i < num_entries; i++) {
      entries.push({
        chunk_hash: reader.readHash(),
        chunk_byte_range_start: reader.readUint32LE(),
        unpacked_segment_bytes: reader.readUint32LE(),
        _unused: Number(reader.readUint64LE()),
      });
    }
    cas_info.push({
      header,
      entries,
    });
  }
  return {
    header,
    footer,
    file_info,
    cas_info,
  };
}
| export async function parseFile( | |
| file: File, | |
| fileType: "xorb" | "shard" | |
| ): Promise<ParsedFileMetadata> { | |
| try { | |
| const arrayBuffer = await file.arrayBuffer(); | |
| const data = new Uint8Array(arrayBuffer); | |
| let parsedData: Chunk[] | ShardData; | |
| if (fileType === "xorb") { | |
| parsedData = parseXorbFile(data); | |
| } else { | |
| parsedData = parseShardFile(data); | |
| } | |
| return { | |
| type: fileType, | |
| filename: file.name, | |
| fileSize: file.size, | |
| data: parsedData, | |
| }; | |
| } catch (error) { | |
| return { | |
| type: fileType, | |
| filename: file.name, | |
| fileSize: file.size, | |
| data: [] as any, | |
| error: error instanceof Error ? error.message : "Unknown error occurred", | |
| }; | |
| } | |
| } | |
| // Helper functions for displaying data | |
| export function formatBytes(bytes: number): string { | |
| if (bytes === 0) return "0 B"; | |
| const k = 1000; | |
| const sizes = ["B", "KB", "MB", "GB"]; | |
| const i = Math.floor(Math.log(bytes) / Math.log(k)); | |
| return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + " " + sizes[i]; | |
| } | |
| export function formatTimestamp(timestamp: number): string { | |
| return new Date(timestamp * 1000).toISOString(); | |
| } | |
| export function formatHashShort(hash: MerkleHash): string { | |
| const fullHash = formatHash(hash); | |
| return fullHash.substring(0, 16) + "..."; | |
| } | |