Creating comprehensive test data sets is crucial for validating file processing systems. In 2025, modern approaches to test data generation let us create realistic, diverse datasets, including security-focused cases, that thoroughly exercise our applications.
Why Test Data Generation Matters
Creating test data by hand is time-consuming and usually leaves gaps. To validate a modern file processing system, the test data needs to cover:
- Diverse file formats - Images, videos, documents, archives
- Varying file sizes - From bytes to gigabytes
- Edge cases - Corrupted files, unusual metadata, malformed content
- Security scenarios - Malicious files, injection attempts
- Performance testing - Large datasets, concurrent uploads
Synthetic File Generation Strategies
1. Programmatic Image Generation
// Modern Canvas API for synthetic image generation
/**
 * Paints synthetic test patterns onto one reusable OffscreenCanvas and encodes
 * the result as an image Blob.
 */
class SyntheticImageGenerator {
  constructor() {
    // A single canvas is created up front; generateTestImage resizes it per call.
    this.canvas = new OffscreenCanvas(1920, 1080);
    this.ctx = this.canvas.getContext("2d");
  }

  /**
   * Resizes the canvas, paints the requested pattern and encodes the canvas.
   *
   * @param {object} [config]
   * @param {number} [config.width=1920] - Canvas width in pixels.
   * @param {number} [config.height=1080] - Canvas height in pixels.
   * @param {string} [config.format="webp"] - Suffix used to build the `image/<format>` MIME type.
   * @param {number} [config.quality=0.8] - Encoder quality passed to convertToBlob.
   * @param {string} [config.pattern="gradient"] - "gradient", "noise", "checkerboard" or "text";
   *   any other value paints nothing before encoding.
   * @returns {Promise<Blob>} Whatever `canvas.convertToBlob` resolves to.
   */
  generateTestImage(config = {}) {
    const { width = 1920, height = 1080, format = "webp", quality = 0.8, pattern = "gradient" } = config;
    this.canvas.width = width;
    this.canvas.height = height;

    // A Map (not a plain object) so names such as "toString" never match.
    const painters = new Map([
      ["gradient", () => this.drawGradient()],
      ["noise", () => this.drawNoise()],
      ["checkerboard", () => this.drawCheckerboard()],
      ["text", () => this.drawTextPattern()],
    ]);
    painters.get(pattern)?.();

    return this.canvas.convertToBlob({ type: `image/${format}`, quality });
  }

  /** Fills the canvas with a three-stop diagonal gradient. */
  drawGradient() {
    const { canvas, ctx } = this;
    const fill = ctx.createLinearGradient(0, 0, canvas.width, canvas.height);
    const stops = [
      [0, "#ff6b6b"],
      [0.5, "#4ecdc4"],
      [1, "#45b7d1"],
    ];
    for (const [offset, colour] of stops) {
      fill.addColorStop(offset, colour);
    }
    ctx.fillStyle = fill;
    ctx.fillRect(0, 0, canvas.width, canvas.height);
  }

  /** Fills the canvas with opaque greyscale noise (one random level per pixel). */
  drawNoise() {
    const { canvas, ctx } = this;
    const frame = ctx.createImageData(canvas.width, canvas.height);
    const pixels = frame.data;
    for (let offset = 0; offset < pixels.length; offset += 4) {
      const grey = Math.random() * 255;
      pixels.fill(grey, offset, offset + 3); // red, green and blue share the level
      pixels[offset + 3] = 255; // fully opaque
    }
    ctx.putImageData(frame, 0, 0);
  }

  /** Tiles the canvas with 50px black and white squares, column by column. */
  drawCheckerboard() {
    const { canvas, ctx } = this;
    const cell = 50;
    for (let col = 0; col * cell < canvas.width; col += 1) {
      for (let row = 0; row * cell < canvas.height; row += 1) {
        ctx.fillStyle = (col + row) % 2 === 0 ? "#000000" : "#ffffff";
        ctx.fillRect(col * cell, row * cell, cell, cell);
      }
    }
  }

  /** Draws a centred title plus the canvas dimensions and the current timestamp. */
  drawTextPattern() {
    const { canvas, ctx } = this;
    const centreX = canvas.width / 2;
    const centreY = canvas.height / 2;

    ctx.fillStyle = "#f0f0f0";
    ctx.fillRect(0, 0, canvas.width, canvas.height);

    ctx.fillStyle = "#333333";
    ctx.font = "48px Arial";
    ctx.textAlign = "center";
    ctx.fillText("TEST IMAGE DATA", centreX, centreY);

    // Smaller metadata lines below the title.
    ctx.font = "24px Arial";
    ctx.fillText(`${canvas.width}x${canvas.height}`, centreX, centreY + 60);
    ctx.fillText(new Date().toISOString(), centreX, centreY + 100);
  }
}
// Generate test image suite
/**
 * Generates the standard image fixtures, from a 1x1 edge case up to an
 * 8192x8192 noise image.
 *
 * @returns {Promise<Array<{name: string, blob: *, metadata: object}>>} One
 *   entry per configuration, in declaration order.
 */
async function generateImageTestSuite() {
  const imageGenerator = new SyntheticImageGenerator();
  const suiteConfigs = [
    { width: 1920, height: 1080, pattern: "gradient", format: "webp" },
    { width: 640, height: 480, pattern: "noise", format: "jpeg" },
    { width: 200, height: 200, pattern: "checkerboard", format: "png" },
    { width: 4096, height: 4096, pattern: "text", format: "png" },
    { width: 1, height: 1, pattern: "gradient", format: "webp" }, // Edge case
    { width: 8192, height: 8192, pattern: "noise", format: "jpeg" }, // Large image
  ];

  const suite = [];
  // Images are produced one at a time: each result is awaited before the next starts.
  for (let index = 0; index < suiteConfigs.length; index += 1) {
    const metadata = suiteConfigs[index];
    const { width, height, pattern, format } = metadata;
    suite.push({
      name: `test_${width}x${height}_${pattern}.${format}`,
      blob: await imageGenerator.generateTestImage(metadata),
      metadata,
    });
  }
  return suite;
}
2. Document Generation with Modern Libraries
// PDF generation with jsPDF for testing
import { jsPDF } from "jspdf";
/**
 * Builds synthetic PDF documents with jsPDF for exercising document pipelines.
 */
class TestDocumentGenerator {
  /**
   * Creates a PDF with the requested number of pages and content style.
   *
   * @param {object} [config]
   * @param {number} [config.pages=1] - Number of pages to produce.
   * @param {string} [config.content="sample"] - "text", "images", "tables" or
   *   "mixed"; any other value yields the generic sample page.
   * @param {string} [config.size="A4"] - Passed to jsPDF as `format`.
   * @param {string} [config.orientation="portrait"] - Passed to jsPDF as `orientation`.
   * @returns {*} The result of `doc.output("blob")`.
   */
  generatePDF(config = {}) {
    const { pages = 1, content = "sample", size = "A4", orientation = "portrait" } = config;
    const doc = new jsPDF({
      orientation,
      unit: "mm",
      format: size,
    });

    for (let pageNum = 1; pageNum <= pages; pageNum++) {
      // Only pages after the first are appended explicitly.
      if (pageNum > 1) doc.addPage();

      switch (content) {
        case "text":
          this.addTextContent(doc, pageNum, pages);
          break;
        case "images":
          this.addImageContent(doc, pageNum);
          break;
        case "tables":
          this.addTableContent(doc, pageNum);
          break;
        case "mixed":
          // Forward the real page count so the footer reads "Page N of <pages>".
          this.addMixedContent(doc, pageNum, pages);
          break;
        default:
          this.addSampleContent(doc, pageNum);
      }
    }

    return doc.output("blob");
  }

  /**
   * Writes a heading, a "Page N of M" line and a block of wrapped filler text.
   *
   * @param {object} doc - jsPDF document to draw on.
   * @param {number} pageNum - 1-based page number shown in the label.
   * @param {number} totalPages - Total page count shown in the label.
   */
  addTextContent(doc, pageNum, totalPages) {
    doc.setFontSize(16);
    doc.text("Test Document - Text Content", 20, 20);
    doc.setFontSize(12);
    doc.text(`Page ${pageNum} of ${totalPages}`, 20, 30);

    // Lorem ipsum content
    const loremText =
      "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " +
      "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. " +
      "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.";
    const lines = doc.splitTextToSize(loremText.repeat(10), 170);
    doc.text(lines, 20, 40);
  }

  /**
   * Draws two grey placeholder rectangles with captions.
   *
   * @param {object} doc - jsPDF document to draw on.
   * @param {number} pageNum - Accepted for signature parity; not used.
   */
  addImageContent(doc, pageNum) {
    doc.setFontSize(16);
    doc.text("Test Document - Image Content", 20, 20);

    // Add placeholder rectangles for images
    doc.setFillColor(200, 200, 200);
    doc.rect(20, 30, 80, 60, "F");
    doc.rect(110, 30, 80, 60, "F");

    doc.setFontSize(10);
    doc.text("Image Placeholder 1", 25, 65);
    doc.text("Image Placeholder 2", 115, 65);
  }

  /**
   * Draws a small three-column table with a bold header row.
   *
   * @param {object} doc - jsPDF document to draw on.
   * @param {number} pageNum - Accepted for signature parity; not used.
   */
  addTableContent(doc, pageNum) {
    doc.setFontSize(16);
    doc.text("Test Document - Table Content", 20, 20);

    // Simple table
    const tableData = [
      ["Name", "Value", "Type"],
      ["File Size", "1.2 MB", "Number"],
      ["Upload Date", "2025-01-15", "Date"],
      ["Status", "Processed", "String"],
    ];

    let y = 40;
    tableData.forEach((row, index) => {
      const isHeader = index === 0;
      doc.setFontSize(isHeader ? 12 : 10);
      doc.setFont(undefined, isHeader ? "bold" : "normal");
      doc.text(row[0], 20, y);
      doc.text(row[1], 70, y);
      doc.text(row[2], 120, y);
      y += 10;
    });
  }

  /**
   * Writes the text page content and adds a circle and a triangle.
   *
   * @param {object} doc - jsPDF document to draw on.
   * @param {number} pageNum - 1-based page number shown in the label.
   * @param {number} [totalPages=1] - Total page count shown in the label. The
   *   default keeps two-argument callers working as before.
   */
  addMixedContent(doc, pageNum, totalPages = 1) {
    this.addTextContent(doc, pageNum, totalPages);

    // Add some geometric shapes
    doc.setDrawColor(100);
    doc.circle(150, 150, 20);
    doc.triangle(170, 180, 190, 180, 180, 160);
  }

  /**
   * Writes a generic title, the generation timestamp and the page number.
   *
   * @param {object} doc - jsPDF document to draw on.
   * @param {number} pageNum - 1-based page number shown on the page.
   */
  addSampleContent(doc, pageNum) {
    doc.text("Sample Test Document", 20, 20);
    doc.text(`Generated on: ${new Date().toISOString()}`, 20, 30);
    doc.text(`Page: ${pageNum}`, 20, 40);
  }
}
// Generate document test suite
/**
 * Generates the standard PDF fixtures (single page, multi-page, and one per
 * content style).
 *
 * @returns {Promise<Array<{name: string, blob: *, metadata: object}>>} One
 *   entry per configuration, in declaration order.
 */
async function generateDocumentTestSuite() {
  const pdfGenerator = new TestDocumentGenerator();
  const suiteConfigs = [
    { pages: 1, content: "text", size: "A4" },
    { pages: 5, content: "mixed", size: "A4" },
    { pages: 1, content: "images", size: "Letter" },
    { pages: 10, content: "text", size: "A3" }, // Large document
    { pages: 1, content: "tables", size: "A4" },
  ];

  // Each configuration doubles as the entry's metadata.
  return suiteConfigs.map((metadata) => ({
    name: `test_doc_${metadata.pages}pages_${metadata.content}.pdf`,
    blob: pdfGenerator.generatePDF(metadata),
    metadata,
  }));
}
Edge Case and Security Test Data
1. Malformed File Generation
/**
 * Produces deliberately broken or hostile files for negative testing.
 */
class MalformedFileGenerator {
  /**
   * Builds a file that starts with the 8-byte PNG signature followed by 1024
   * random bytes.
   *
   * @returns {Blob} 1032-byte Blob typed "image/png".
   */
  generateCorruptedImage() {
    // Create a valid PNG header but corrupt the data
    const validPNGHeader = new Uint8Array([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]);
    const corruptedData = new Uint8Array(1024);
    // Fill with random data
    crypto.getRandomValues(corruptedData);

    // Combine valid header with corrupted data
    const combined = new Uint8Array(validPNGHeader.length + corruptedData.length);
    combined.set(validPNGHeader);
    combined.set(corruptedData, validPNGHeader.length);
    return new Blob([combined], { type: "image/png" });
  }

  /**
   * Builds a JPEG-looking file: SOI + APP1 marker bytes, 100,000 bytes of
   * 0x41 filler, then the EOI marker.
   *
   * @returns {Blob} Blob typed "image/jpeg".
   */
  generateOversizedMetadata() {
    // Create a minimal JPEG with massive EXIF data
    const jpegHeader = new Uint8Array([0xff, 0xd8, 0xff, 0xe1]);
    const oversizedExif = new Uint8Array(100000); // 100KB of EXIF data
    oversizedExif.fill(0x41); // Fill with 'A' characters
    const jpegEnd = new Uint8Array([0xff, 0xd9]);

    const combined = new Uint8Array(jpegHeader.length + oversizedExif.length + jpegEnd.length);
    combined.set(jpegHeader);
    combined.set(oversizedExif, jpegHeader.length);
    combined.set(jpegEnd, jpegHeader.length + oversizedExif.length);
    return new Blob([combined], { type: "image/jpeg" });
  }

  /**
   * Placeholder for a zip bomb: returns only the 4-byte ZIP local-file-header
   * signature. A real expanding archive would need a proper ZIP implementation.
   * For testing purposes only.
   *
   * @returns {Blob} 4-byte Blob typed "application/zip".
   */
  generateZipBomb() {
    return new Blob([new Uint8Array([0x50, 0x4b, 0x03, 0x04])], {
      type: "application/zip",
    });
  }

  /**
   * Builds a text file with three NUL characters embedded in its content.
   *
   * @returns {Blob} Blob typed "text/plain".
   */
  generateFileWithNullBytes() {
    const content = "Normal content\x00\x00\x00More content";
    return new Blob([content], { type: "text/plain" });
  }

  /**
   * Builds a text file whose name is 304 characters long (300 "a" + ".txt").
   *
   * A plain Blob has no name, so the long name would be lost. The result is a
   * File where the runtime provides one (File extends Blob, so Blob consumers
   * are unaffected); otherwise a Blob carrying a `name` property.
   *
   * @returns {Blob} Named file-like Blob typed "text/plain".
   */
  generateFileWithLongFilename() {
    const longName = `${"a".repeat(300)}.txt`;
    const content = "File with extremely long filename for testing";
    const options = { type: "text/plain" };

    if (typeof File === "function") {
      return new File([content], longName, options);
    }

    // Fallback for runtimes without a global File constructor.
    const blob = new Blob([content], options);
    Object.defineProperty(blob, "name", { value: longName, enumerable: true });
    return blob;
  }
}
2. Performance Test Data Generation
/**
 * Generates bulk binary and text files for load and concurrency testing.
 */
class PerformanceTestDataGenerator {
  /**
   * Generates `fileCount` random binary files whose sizes vary by up to ±30%
   * around the average implied by `totalSizeMB`.
   *
   * @param {number} [totalSizeMB=100] - Target combined size in MiB.
   * @param {number} [fileCount=10] - Number of files to produce.
   * @returns {Promise<Array<{name: string, blob: Blob, size: number}>>}
   */
  async generateLargeFileSet(totalSizeMB = 100, fileCount = 10) {
    const files = [];
    const avgFileSizeBytes = (totalSizeMB * 1024 * 1024) / fileCount;
    const variance = 0.3; // 30% variance

    for (let i = 0; i < fileCount; i++) {
      // Vary file sizes around the average
      const sizeVariation = (Math.random() - 0.5) * 2 * variance;
      const fileSize = Math.floor(avgFileSizeBytes * (1 + sizeVariation));
      const fileData = await this.generateRandomData(fileSize);
      files.push({
        name: `perf_test_${i + 1}.bin`,
        blob: new Blob([fileData], { type: "application/octet-stream" }),
        size: fileSize,
      });
    }
    return files;
  }

  /**
   * Returns `sizeBytes` random bytes.
   *
   * `crypto.getRandomValues` rejects views larger than 65,536 bytes, so one
   * preallocated buffer is filled slice by slice. This also avoids copying the
   * data through intermediate plain arrays.
   *
   * @param {number} sizeBytes - Number of bytes; fractions are truncated and
   *   non-positive or NaN values yield an empty array.
   * @returns {Promise<Uint8Array>}
   */
  async generateRandomData(sizeBytes) {
    const maxBytesPerCall = 65536; // getRandomValues quota
    const length = Math.max(0, Math.floor(sizeBytes) || 0);
    const data = new Uint8Array(length);

    for (let offset = 0; offset < length; offset += maxBytesPerCall) {
      // subarray is a view, so the random bytes land directly in `data`.
      crypto.getRandomValues(data.subarray(offset, offset + maxBytesPerCall));
    }
    return data;
  }

  /**
   * Generates small text files, each tagged with a random upload delay.
   *
   * @param {number} [concurrency=10] - Number of files to produce.
   * @returns {Array<{name: string, blob: Blob, uploadDelay: number}>}
   */
  generateConcurrentUploadSet(concurrency = 10) {
    const files = [];
    for (let i = 0; i < concurrency; i++) {
      const content = `Concurrent upload test file ${i + 1}\n`.repeat(1000);
      files.push({
        name: `concurrent_${i + 1}.txt`,
        blob: new Blob([content], { type: "text/plain" }),
        uploadDelay: Math.random() * 1000, // Random delay 0-1000ms
      });
    }
    return files;
  }
}
Data-Driven Test Configuration
Modern Test Data Management
// test-data-config.js
// Image cases: encodable formats, deliberately broken files, and extreme dimensions.
const imageDataSets = {
  valid: [
    { format: "webp", size: "small", quality: "high" },
    { format: "avif", size: "medium", quality: "medium" },
    { format: "jpeg", size: "large", quality: "low" },
  ],
  invalid: [
    { type: "corrupted_header" },
    { type: "oversized_metadata" },
    { type: "unsupported_format" },
  ],
  edge_cases: [
    { width: 1, height: 1 },
    { width: 65535, height: 65535 },
    { animated: true, frames: 100 },
  ],
};

// Document cases: ordinary office formats plus security-sensitive variants.
const documentDataSets = {
  valid: [
    { type: "pdf", pages: 1, size: "A4" },
    { type: "docx", pages: 10, tables: true },
    { type: "xlsx", sheets: 3, rows: 1000 },
  ],
  security: [
    { type: "password_protected" },
    { type: "macro_enabled" },
    { type: "external_links" },
  ],
};

// Performance cases: bulk loads and stress conditions.
const performanceDataSets = {
  load_testing: [
    { file_count: 100, total_size_mb: 50 },
    { file_count: 1000, total_size_mb: 100 },
    { file_count: 10, total_size_mb: 500 },
  ],
  stress_testing: [
    { concurrent_uploads: 50 },
    { file_size_gb: 2 },
    { rapid_uploads: { count: 100, interval_ms: 10 } },
  ],
};

/**
 * Declarative catalogue of test cases, addressed as "<group>.<set>"
 * (for example "images.valid").
 */
export const testDataSets = {
  images: imageDataSets,
  documents: documentDataSets,
  performance: performanceDataSets,
};
// Test data factory
/**
 * Turns entries of `testDataSets` into generated files by dispatching each
 * entry to the matching generator.
 */
export class TestDataFactory {
  constructor() {
    this.imageGenerator = new SyntheticImageGenerator();
    this.documentGenerator = new TestDocumentGenerator();
    this.malformedGenerator = new MalformedFileGenerator();
    this.performanceGenerator = new PerformanceTestDataGenerator();
  }

  /**
   * Generates every test case of one data set, in order.
   *
   * @param {string} configPath - "<group>.<set>" path into `testDataSets`,
   *   e.g. "images.valid".
   * @returns {Promise<Array>} One generated result per test case.
   * @throws {TypeError} If `configPath` is not a string.
   * @throws {Error} If the path does not name an existing data set, or a test
   *   case in the set cannot be dispatched.
   */
  async generateFromConfig(configPath) {
    if (typeof configPath !== "string") {
      throw new TypeError(`configPath must be a string, received ${typeof configPath}`);
    }

    const [groupName, setName] = configPath.split(".");
    // Own-property checks so inherited keys such as "constructor" never resolve.
    const group = Object.hasOwn(testDataSets, groupName) ? testDataSets[groupName] : undefined;
    const testSet = group && Object.hasOwn(group, setName) ? group[setName] : undefined;
    if (!Array.isArray(testSet)) {
      throw new Error(`Unknown test data set: ${configPath}`);
    }

    const results = [];
    // Sequential on purpose: results keep the order of the data set.
    for (const testCase of testSet) {
      results.push(await this.generateTestCase(testCase));
    }
    return results;
  }

  /**
   * Generates one test case, keyed by its `type` or, failing that, its `format`.
   *
   * @param {object} testCase - Entry from a data set.
   * @returns {Promise<*>} Whatever the selected generator returns.
   * @throws {Error} If no generator handles the test case.
   */
  async generateTestCase(testCase) {
    const kind = testCase.type || testCase.format;
    switch (kind) {
      case "webp":
      case "avif":
      case "jpeg":
        return this.generateImageTestCase(testCase);
      case "pdf":
        return this.generateDocumentTestCase(testCase);
      case "corrupted_header":
        return this.malformedGenerator.generateCorruptedImage();
      case "oversized_metadata":
        return this.malformedGenerator.generateOversizedMetadata();
      default:
        // Report the key that was actually dispatched on (type or format).
        throw new Error(`Unknown test case type: ${kind}`);
    }
  }

  /**
   * Generates a square image from a named size and quality level.
   *
   * @param {object} config
   * @param {string} config.format - Image format suffix.
   * @param {string} [config.size] - "small", "medium" or "large"; anything
   *   else falls back to 1200px.
   * @param {string} [config.quality] - "low", "medium" or "high"; anything
   *   else falls back to 0.8.
   * @returns {Promise<*>} Result of the image generator.
   */
  async generateImageTestCase(config) {
    const sizeMap = { small: 400, medium: 1200, large: 3000 };
    const qualityMap = { low: 0.3, medium: 0.7, high: 0.9 };
    const edge = sizeMap[config.size] || 1200;

    return this.imageGenerator.generateTestImage({
      width: edge,
      height: edge,
      format: config.format,
      quality: qualityMap[config.quality] || 0.8,
    });
  }

  /**
   * Generates a PDF with the requested page count and paper size.
   *
   * @param {object} config
   * @param {number} [config.pages] - Forwarded as-is to the PDF generator.
   * @param {string} [config.size="A4"] - Paper size.
   * @returns {*} Result of the document generator.
   */
  generateDocumentTestCase(config) {
    return this.documentGenerator.generatePDF({
      pages: config.pages,
      size: config.size || "A4",
    });
  }
}
Advanced Testing Scenarios
1. Automated Test Data Pipeline
// test-data-pipeline.js
/**
 * Runs scenarios through the TestDataFactory, collects the generated files
 * into a suite, and keeps their blobs retrievable by id.
 */
export class TestDataPipeline {
  constructor() {
    this.factory = new TestDataFactory();
    this.storage = new Map();
  }

  /**
   * Generates every file of every scenario.
   *
   * Factory results may be bare Blobs or `{ name, blob, metadata }` records;
   * both are normalised to the same suite entry shape.
   *
   * @param {Array<{name: string, config: string, extension: string}>} scenarios
   *   `config` is a "<group>.<set>" data-set path; `extension` is used only
   *   for fallback file names.
   * @returns {Promise<{metadata: object, files: Array<object>}>}
   * @throws {Error} If the factory yields an entry with no blob.
   */
  async generateTestSuite(scenarios) {
    const testSuite = {
      metadata: {
        generated_at: new Date().toISOString(),
        total_files: 0,
        total_size_mb: 0,
      },
      files: [],
    };

    for (const scenario of scenarios) {
      console.log(`Generating test data for: ${scenario.name}`);
      const files = await this.factory.generateFromConfig(scenario.config);

      for (const file of files) {
        // Generators hand back bare Blobs; wrapped records carry theirs in `.blob`.
        const isBareBlob = typeof Blob !== "undefined" && file instanceof Blob;
        const blob = isBareBlob ? file : file?.blob;
        if (!blob) {
          throw new Error(`Scenario "${scenario.name}" produced an entry without a blob`);
        }

        const testFile = {
          id: crypto.randomUUID(),
          // Named results (records or File objects) keep their own name.
          name: file.name || `${scenario.name}_${testSuite.files.length}.${scenario.extension}`,
          blob,
          size: blob.size,
          type: blob.type,
          scenario: scenario.name,
          metadata: file.metadata || {},
        };
        testSuite.files.push(testFile);
        testSuite.metadata.total_files++;
        testSuite.metadata.total_size_mb += testFile.size / (1024 * 1024);
      }
    }
    return testSuite;
  }

  /**
   * Logs a blob-free manifest of the suite and stores each blob in memory.
   *
   * @param {{metadata: object, files: Array<object>}} testSuite
   * @param {string} outputPath - Currently unused; a real implementation
   *   would write to the filesystem or cloud storage here.
   * @returns {Promise<void>}
   */
  async saveTestSuite(testSuite, outputPath) {
    // Save metadata
    const manifest = {
      metadata: testSuite.metadata,
      files: testSuite.files.map(({ id, name, size, type, scenario, metadata }) => ({
        id,
        name,
        size,
        type,
        scenario,
        metadata,
      })),
    };

    // In a real implementation, you'd save to filesystem or cloud storage
    console.log("Test suite manifest:", manifest);

    // Store files for retrieval
    for (const file of testSuite.files) {
      this.storage.set(file.id, file.blob);
    }
  }

  /**
   * @param {string} fileId - Id assigned by generateTestSuite.
   * @returns {*} The stored blob, or undefined if the id was never saved.
   */
  getTestFile(fileId) {
    return this.storage.get(fileId);
  }
}
// Usage example
// Scenarios to generate. `name` labels the resulting files, `config` is a
// "<group>.<set>" path into testDataSets, and `extension` is used when a
// generated file needs a fallback name.
const scenarios = [
{
name: "standard_images",
config: "images.valid",
extension: "webp",
},
{
name: "security_tests",
config: "images.invalid",
extension: "png",
},
{
name: "performance_load",
config: "performance.load_testing",
extension: "bin",
},
];
// NOTE(review): generateTestCase throws for entries it cannot dispatch on, and
// the performance.load_testing entries carry neither `type` nor `format` —
// confirm the factory supports every set listed here before running this.
const pipeline = new TestDataPipeline();
// Top-level await: this snippet has to run as an ES module.
const testSuite = await pipeline.generateTestSuite(scenarios);
// saveTestSuite logs a manifest and keeps the blobs in memory; the path
// argument is not used by the implementation shown in this article.
await pipeline.saveTestSuite(testSuite, "./test-data");
2. Test Data Validation
// test-data-validator.js
/**
 * Checks a generated test suite for structural problems and gathers summary
 * statistics about it.
 */
export class TestDataValidator {
  /**
   * Validates every file in the suite.
   *
   * @param {{files: Array<object>}} testSuite
   * @returns {{valid: boolean, issues: string[], statistics: object}} `valid`
   *   is false as soon as any issue is recorded.
   */
  validateTestSuite(testSuite) {
    const validationReport = {
      valid: true,
      issues: [],
      statistics: {
        total_files: testSuite.files.length,
        total_size_mb: 0,
        formats: {},
        scenarios: {},
      },
    };

    for (const file of testSuite.files) {
      // Validate file properties
      this.validateFile(file, validationReport);
      // Update statistics
      this.updateStatistics(file, validationReport.statistics);
    }
    return validationReport;
  }

  /**
   * Records issues for one file on the report (mutates `report`).
   *
   * @param {object} file - Suite entry to check.
   * @param {{valid: boolean, issues: string[]}} report
   */
  validateFile(file, report) {
    const fail = (message) => {
      report.issues.push(message);
      report.valid = false;
    };

    // Check required properties
    const required = ["id", "name", "blob", "size", "type", "scenario"];
    for (const prop of required) {
      const value = file[prop];
      // 0 is a legitimate value (the size of an empty file); every other
      // falsy value still counts as missing.
      const isPresent = value === 0 || Boolean(value);
      if (!isPresent) {
        fail(`File missing required property: ${prop}`);
      }
    }

    // Validate file size consistency
    if (file.blob && file.size !== file.blob.size) {
      fail(`Size mismatch for ${file.name}: reported ${file.size}, actual ${file.blob.size}`);
    }

    // Validate file name
    if (file.name && !this.isValidFileName(file.name)) {
      fail(`Invalid file name: ${file.name}`);
    }
  }

  /**
   * Adds one file to the running totals (mutates `stats`).
   *
   * @param {object} file - Suite entry to count.
   * @param {{total_size_mb: number, formats: object, scenarios: object}} stats
   */
  updateStatistics(file, stats) {
    // A missing or non-numeric size is reported by validateFile; skipping it
    // here keeps the total from turning into NaN.
    if (Number.isFinite(file.size)) {
      stats.total_size_mb += file.size / (1024 * 1024);
    }

    // Count formats
    const format = file.type || "unknown";
    stats.formats[format] = (stats.formats[format] || 0) + 1;

    // Count scenarios
    stats.scenarios[file.scenario] = (stats.scenarios[file.scenario] || 0) + 1;
  }

  /**
   * Basic filename validation: rejects reserved punctuation, path separators
   * and control characters, and names longer than 255 characters.
   *
   * @param {string} filename
   * @returns {boolean}
   */
  isValidFileName(filename) {
    const invalidChars = /[<>:"/\\|?*\x00-\x1f]/;
    return !invalidChars.test(filename) && filename.length <= 255;
  }
}
Integration with Modern Testing Frameworks
Vitest Integration
// __tests__/test-data.setup.js
import { beforeAll, afterAll } from "vitest";
import { TestDataPipeline } from "../src/test-utils/test-data-pipeline.js";
// Module-level state shared by the hooks below.
let testDataPipeline;
let testSuite;
// Generate the whole suite once, before any test in this setup's scope runs.
beforeAll(async () => {
testDataPipeline = new TestDataPipeline();
// Each `config` is a "<group>.<set>" path into testDataSets.
// NOTE(review): the images.edge_cases entries have neither `type` nor
// `format`, which is what the factory's generateTestCase dispatches on —
// confirm that set can be generated before relying on it here.
const scenarios = [
{ name: "unit_test_images", config: "images.valid", extension: "webp" },
{ name: "edge_cases", config: "images.edge_cases", extension: "png" },
];
testSuite = await testDataPipeline.generateTestSuite(scenarios);
// Make test data globally available
globalThis.testData = testSuite;
});
// Drop the global reference once the tests are done.
afterAll(() => {
// Cleanup if needed
globalThis.testData = null;
});
// __tests__/file-processing.test.js
import { describe, it, expect } from "vitest";
import { processFile } from "../src/file-processor.js";
describe("File Processing with Generated Test Data", () => {
  // Generated files belonging to one scenario, read from the global set up
  // by the test-data setup file.
  const filesFor = (scenarioName) => globalThis.testData.files.filter((entry) => entry.scenario === scenarioName);

  it("should process standard images correctly", async () => {
    for (const { blob } of filesFor("unit_test_images")) {
      const outcome = await processFile(blob);
      expect(outcome.success).toBe(true);
      expect(outcome.format).toBeTruthy();
      expect(outcome.dimensions).toBeDefined();
    }
  });

  it("should handle edge cases gracefully", async () => {
    for (const { blob } of filesFor("edge_cases")) {
      const outcome = await processFile(blob);
      // Edge cases might fail, but should fail gracefully
      expect(outcome).toBeDefined();
      if (outcome.success) continue;
      // A failure must come with a string error message.
      expect(outcome.error).toBeDefined();
      expect(typeof outcome.error).toBe("string");
    }
  });
});
Best Practices for 2025
1. Automated Generation
- Use synthetic data generation instead of static test files
- Generate test data on-demand to reduce repository size
- Create parameterized test data for different scenarios
2. Comprehensive Coverage
- Include edge cases (empty files, huge files, malformed data)
- Test security scenarios (malicious files, injection attempts)
- Generate performance test datasets automatically
3. Modern Formats
- Focus on WebP, AVIF, and other modern image formats
- Test new video codecs (AV1, VP9)
- Include modern document formats and standards
4. Realistic Data
- Generate data that matches real-world file characteristics
- Include metadata patterns found in production
- Simulate user behavior patterns in file uploads
5. Scalable Testing
- Design test data generation to work in CI/CD pipelines
- Use streaming and chunking for large file testing
- Implement parallel test data generation
Creating comprehensive test data sets is essential for robust file processing systems. Modern approaches using synthetic generation, automated pipelines, and integration with testing frameworks ensure thorough validation while maintaining efficiency and security in 2025 development workflows.
