Test Data Generation for File Processing Applications

Comprehensive test data is crucial for validating file processing systems. In 2025, modern test data generation techniques let us create realistic, diverse, and secure datasets that thoroughly exercise our applications.

Why Test Data Generation Matters

Creating test data by hand is time-consuming and often leaves gaps in coverage. Modern file processing systems need to handle:

Diverse file formats - Images, videos, documents, archives
Varying file sizes - From bytes to gigabytes
Edge cases - Corrupted files, unusual metadata, malformed content
Security scenarios - Malicious files, injection attempts
Performance testing - Large datasets, concurrent uploads

Synthetic File Generation Strategies

1. Programmatic Image Generation

// Modern Canvas API for synthetic image generation
/**
 * Paints synthetic test patterns onto one reusable OffscreenCanvas and
 * encodes the result as an image Blob.
 */
class SyntheticImageGenerator {
  constructor() {
    // One canvas is shared by every call; generateTestImage resizes it.
    this.canvas = new OffscreenCanvas(1920, 1080);
    this.ctx = this.canvas.getContext("2d");
  }

  /**
   * Resizes the canvas, draws the requested pattern and encodes the canvas.
   *
   * @param {object} [config]
   * @param {number} [config.width=1920]
   * @param {number} [config.height=1080]
   * @param {string} [config.format="webp"] - Suffix appended to `image/`.
   * @param {number} [config.quality=0.8] - Passed through to `convertToBlob`.
   * @param {string} [config.pattern="gradient"] - "gradient", "noise",
   *   "checkerboard" or "text"; any other value draws nothing.
   * @returns {Promise<Blob>} The value returned by `canvas.convertToBlob`.
   */
  generateTestImage(config = {}) {
    const {
      width = 1920,
      height = 1080,
      format = "webp",
      quality = 0.8,
      pattern = "gradient",
    } = config;

    const surface = this.canvas;
    surface.width = width;
    surface.height = height;

    if (pattern === "gradient") {
      this.drawGradient();
    } else if (pattern === "noise") {
      this.drawNoise();
    } else if (pattern === "checkerboard") {
      this.drawCheckerboard();
    } else if (pattern === "text") {
      this.drawTextPattern();
    }

    const encoding = { type: `image/${format}`, quality };
    return surface.convertToBlob(encoding);
  }

  /** Fills the canvas with a three-stop diagonal linear gradient. */
  drawGradient() {
    const { canvas, ctx } = this;
    const fill = ctx.createLinearGradient(0, 0, canvas.width, canvas.height);
    const colorStops = [
      [0, "#ff6b6b"],
      [0.5, "#4ecdc4"],
      [1, "#45b7d1"],
    ];
    for (const [offset, color] of colorStops) {
      fill.addColorStop(offset, color);
    }

    ctx.fillStyle = fill;
    ctx.fillRect(0, 0, canvas.width, canvas.height);
  }

  /** Fills the canvas with opaque grayscale noise, one random level per pixel. */
  drawNoise() {
    const { canvas, ctx } = this;
    const frame = ctx.createImageData(canvas.width, canvas.height);
    const pixels = frame.data;

    // Pixels are stored as consecutive R, G, B, A bytes.
    for (let offset = 0; offset < pixels.length; offset += 4) {
      const gray = Math.random() * 255;
      pixels.fill(gray, offset, offset + 3);
      pixels[offset + 3] = 255;
    }

    ctx.putImageData(frame, 0, 0);
  }

  /** Tiles the canvas with 50px black and white squares, column by column. */
  drawCheckerboard() {
    const { canvas, ctx } = this;
    const cell = 50;

    for (let left = 0; left < canvas.width; left += cell) {
      for (let top = 0; top < canvas.height; top += cell) {
        const parity = (left / cell + top / cell) % 2;
        ctx.fillStyle = parity === 0 ? "#000000" : "#ffffff";
        ctx.fillRect(left, top, cell, cell);
      }
    }
  }

  /**
   * Draws a light background with a centered caption, followed by the canvas
   * dimensions and the current ISO timestamp in a smaller font.
   */
  drawTextPattern() {
    const { canvas, ctx } = this;
    const centerX = canvas.width / 2;
    const centerY = canvas.height / 2;

    ctx.fillStyle = "#f0f0f0";
    ctx.fillRect(0, 0, canvas.width, canvas.height);

    ctx.fillStyle = "#333333";
    ctx.font = "48px Arial";
    ctx.textAlign = "center";
    ctx.fillText("TEST IMAGE DATA", centerX, centerY);

    // Smaller metadata lines below the caption.
    ctx.font = "24px Arial";
    ctx.fillText(`${canvas.width}x${canvas.height}`, centerX, centerY + 60);
    ctx.fillText(new Date().toISOString(), centerX, centerY + 100);
  }
}

// Generate test image suite
/**
 * Builds the standard image fixture set with a single SyntheticImageGenerator.
 *
 * Images are generated one after another because the generator draws every
 * image on the same canvas.
 *
 * @returns {Promise<Array<{name: string, blob: *, metadata: object}>>} One
 *   entry per spec, where `metadata` is the spec object itself.
 */
async function generateImageTestSuite() {
  const imageSpecs = [
    { width: 1920, height: 1080, pattern: "gradient", format: "webp" },
    { width: 640, height: 480, pattern: "noise", format: "jpeg" },
    { width: 200, height: 200, pattern: "checkerboard", format: "png" },
    { width: 4096, height: 4096, pattern: "text", format: "png" },
    // Smallest possible image
    { width: 1, height: 1, pattern: "gradient", format: "webp" },
    // Large image
    { width: 8192, height: 8192, pattern: "noise", format: "jpeg" },
  ];

  const generator = new SyntheticImageGenerator();
  const suite = [];

  for (const spec of imageSpecs) {
    const blob = await generator.generateTestImage(spec);
    const { width, height, pattern, format } = spec;
    suite.push({
      name: `test_${width}x${height}_${pattern}.${format}`,
      blob,
      metadata: spec,
    });
  }

  return suite;
}

2. Document Generation with Modern Libraries

// PDF generation with jsPDF for testing
import { jsPDF } from "jspdf";

/**
 * Generates synthetic PDF documents with jsPDF for use as test fixtures.
 */
class TestDocumentGenerator {
  /**
   * Builds a PDF with the requested number of pages and content style.
   *
   * @param {object} [config]
   * @param {number} [config.pages=1] - Number of pages to produce.
   * @param {string} [config.content="sample"] - "text", "images", "tables" or
   *   "mixed"; any other value produces the sample page.
   * @param {string} [config.size="A4"] - Passed to jsPDF as `format`.
   * @param {string} [config.orientation="portrait"]
   * @returns {*} The result of `doc.output("blob")`.
   */
  generatePDF(config = {}) {
    const { pages = 1, content = "sample", size = "A4", orientation = "portrait" } = config;

    const doc = new jsPDF({
      orientation,
      unit: "mm",
      format: size,
    });

    for (let i = 1; i <= pages; i++) {
      // A new jsPDF document is used as page 1; only later pages are added.
      if (i > 1) doc.addPage();

      switch (content) {
        case "text":
          this.addTextContent(doc, i, pages);
          break;
        case "images":
          this.addImageContent(doc, i);
          break;
        case "tables":
          this.addTableContent(doc, i);
          break;
        case "mixed":
          // Pass the real page count so the "Page X of Y" footer is correct.
          this.addMixedContent(doc, i, pages);
          break;
        default:
          this.addSampleContent(doc, i);
      }
    }

    return doc.output("blob");
  }

  /**
   * Writes a heading, a "Page X of Y" line and a block of filler text.
   *
   * @param {object} doc - jsPDF document to draw on.
   * @param {number} pageNum - 1-based number of the current page.
   * @param {number} totalPages - Total page count shown in the page line.
   */
  addTextContent(doc, pageNum, totalPages) {
    doc.setFontSize(16);
    doc.text("Test Document - Text Content", 20, 20);

    doc.setFontSize(12);
    doc.text(`Page ${pageNum} of ${totalPages}`, 20, 30);

    // Lorem ipsum content
    const loremText =
      "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " +
      "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. " +
      "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.";

    // Wrap the repeated filler text to a 170-unit wide column.
    const lines = doc.splitTextToSize(loremText.repeat(10), 170);
    doc.text(lines, 20, 40);
  }

  /**
   * Writes a heading and two gray labelled rectangles standing in for images.
   *
   * @param {object} doc - jsPDF document to draw on.
   * @param {number} pageNum - Current page number (not used by this layout).
   */
  addImageContent(doc, pageNum) {
    doc.setFontSize(16);
    doc.text("Test Document - Image Content", 20, 20);

    // Add placeholder rectangles for images
    doc.setFillColor(200, 200, 200);
    doc.rect(20, 30, 80, 60, "F");
    doc.rect(110, 30, 80, 60, "F");

    doc.setFontSize(10);
    doc.text("Image Placeholder 1", 25, 65);
    doc.text("Image Placeholder 2", 115, 65);
  }

  /**
   * Writes a heading and a fixed three-column table with a bold header row.
   *
   * @param {object} doc - jsPDF document to draw on.
   * @param {number} pageNum - Current page number (not used by this layout).
   */
  addTableContent(doc, pageNum) {
    doc.setFontSize(16);
    doc.text("Test Document - Table Content", 20, 20);

    // Simple table
    const tableData = [
      ["Name", "Value", "Type"],
      ["File Size", "1.2 MB", "Number"],
      ["Upload Date", "2025-01-15", "Date"],
      ["Status", "Processed", "String"],
    ];

    let y = 40;
    tableData.forEach((row, index) => {
      if (index === 0) {
        doc.setFontSize(12);
        doc.setFont(undefined, "bold");
      } else {
        doc.setFontSize(10);
        doc.setFont(undefined, "normal");
      }

      doc.text(row[0], 20, y);
      doc.text(row[1], 70, y);
      doc.text(row[2], 120, y);
      y += 10;
    });
  }

  /**
   * Writes the text layout followed by a circle and a triangle.
   *
   * @param {object} doc - jsPDF document to draw on.
   * @param {number} pageNum - 1-based number of the current page.
   * @param {number} [totalPages=1] - Total page count for the page line.
   *   Defaults to 1 so existing two-argument callers keep their output.
   */
  addMixedContent(doc, pageNum, totalPages = 1) {
    this.addTextContent(doc, pageNum, totalPages);

    // Add some geometric shapes
    doc.setDrawColor(100);
    doc.circle(150, 150, 20);
    doc.triangle(170, 180, 190, 180, 180, 160);
  }

  /**
   * Writes a title, the generation timestamp and the page number.
   *
   * @param {object} doc - jsPDF document to draw on.
   * @param {number} pageNum - 1-based number of the current page.
   */
  addSampleContent(doc, pageNum) {
    doc.text("Sample Test Document", 20, 20);
    doc.text(`Generated on: ${new Date().toISOString()}`, 20, 30);
    doc.text(`Page: ${pageNum}`, 20, 40);
  }
}

// Generate document test suite
/**
 * Builds the standard PDF fixture set with a single TestDocumentGenerator.
 *
 * @returns {Promise<Array<{name: string, blob: *, metadata: object}>>} One
 *   entry per spec, where `metadata` is the spec object itself.
 */
async function generateDocumentTestSuite() {
  const generator = new TestDocumentGenerator();

  const documentSpecs = [
    { pages: 1, content: "text", size: "A4" },
    { pages: 5, content: "mixed", size: "A4" },
    { pages: 1, content: "images", size: "Letter" },
    // Large document
    { pages: 10, content: "text", size: "A3" },
    { pages: 1, content: "tables", size: "A4" },
  ];

  return documentSpecs.map((spec) => ({
    name: `test_doc_${spec.pages}pages_${spec.content}.pdf`,
    blob: generator.generatePDF(spec),
    metadata: spec,
  }));
}

Edge Case and Security Test Data

1. Malformed File Generation

/**
 * Produces deliberately malformed or awkward files for negative testing.
 */
class MalformedFileGenerator {
  /**
   * Builds a file that starts with the 8-byte PNG signature followed by
   * 1024 random bytes.
   *
   * @returns {Blob} A 1032-byte Blob typed as `image/png`.
   */
  generateCorruptedImage() {
    // Create a valid PNG header but corrupt the data
    const validPNGHeader = new Uint8Array([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]);

    const corruptedData = new Uint8Array(1024);
    crypto.getRandomValues(corruptedData);

    // Blob concatenates its parts in order, so no manual copy is needed.
    return new Blob([validPNGHeader, corruptedData], { type: "image/png" });
  }

  /**
   * Builds a JPEG-like file whose metadata segment is 100,000 bytes of "A".
   *
   * @returns {Blob} A Blob typed as `image/jpeg`: 4 header bytes, the
   *   oversized payload, then the 2 closing bytes.
   */
  generateOversizedMetadata() {
    // Create a minimal JPEG with massive EXIF data
    const jpegHeader = new Uint8Array([0xff, 0xd8, 0xff, 0xe1]);
    const oversizedExif = new Uint8Array(100000).fill(0x41); // 100KB of 'A' characters
    const jpegEnd = new Uint8Array([0xff, 0xd9]);

    return new Blob([jpegHeader, oversizedExif, jpegEnd], { type: "image/jpeg" });
  }

  /**
   * Placeholder for a zip bomb. For testing purposes only.
   *
   * This does NOT build an expanding archive: a real zip bomb needs a proper
   * ZIP implementation. It returns only the four bytes 50 4B 03 04.
   *
   * @returns {Blob} A 4-byte Blob typed as `application/zip`.
   */
  generateZipBomb() {
    return new Blob([new Uint8Array([0x50, 0x4b, 0x03, 0x04])], {
      type: "application/zip",
    });
  }

  /**
   * Builds a text file with three NUL characters embedded in the content.
   *
   * @returns {Blob} A Blob typed as `text/plain`.
   */
  generateFileWithNullBytes() {
    const content = "Normal content\x00\x00\x00More content";
    return new Blob([content], { type: "text/plain" });
  }

  /**
   * Builds a text file whose name is 300 "a" characters plus ".txt".
   *
   * A plain Blob has no name, so the long name was previously computed and
   * then dropped. A File is returned instead; File extends Blob, so callers
   * that only need Blob behavior are unaffected.
   *
   * @returns {File} A `text/plain` File carrying the 304-character name.
   */
  generateFileWithLongFilename() {
    const longName = "a".repeat(300) + ".txt";
    const content = "File with extremely long filename for testing";
    return new File([content], longName, { type: "text/plain" });
  }
}

2. Performance Test Data Generation

/**
 * Generates bulk binary and text fixtures for load and concurrency tests.
 */
class PerformanceTestDataGenerator {
  /**
   * Generates `fileCount` random binary files whose sizes vary by up to 30%
   * around the average implied by `totalSizeMB`.
   *
   * Because each size is randomized independently, the combined size only
   * approximates `totalSizeMB`.
   *
   * @param {number} [totalSizeMB=100] - Target combined size in MiB.
   * @param {number} [fileCount=10] - Number of files to generate.
   * @returns {Promise<Array<{name: string, blob: Blob, size: number}>>}
   */
  async generateLargeFileSet(totalSizeMB = 100, fileCount = 10) {
    const files = [];
    const avgFileSizeBytes = (totalSizeMB * 1024 * 1024) / fileCount;
    const variance = 0.3; // 30% variance

    for (let i = 0; i < fileCount; i++) {
      // Vary file sizes around the average
      const sizeVariation = (Math.random() - 0.5) * 2 * variance;
      const fileSize = Math.floor(avgFileSizeBytes * (1 + sizeVariation));

      const fileData = await this.generateRandomData(fileSize);
      files.push({
        name: `perf_test_${i + 1}.bin`,
        blob: new Blob([fileData], { type: "application/octet-stream" }),
        size: fileSize,
      });
    }

    return files;
  }

  /**
   * Returns `sizeBytes` random bytes.
   *
   * `crypto.getRandomValues` rejects views larger than 65,536 bytes, so the
   * result buffer is allocated once and filled in slices of at most that
   * size. This also avoids building intermediate JS arrays.
   *
   * @param {number} sizeBytes - Requested length; fractional values are
   *   truncated, negative or NaN values yield an empty array.
   * @returns {Promise<Uint8Array>}
   */
  async generateRandomData(sizeBytes) {
    const maxBytesPerCall = 65536;
    const totalBytes = Math.max(0, Math.floor(sizeBytes)) || 0;
    const data = new Uint8Array(totalBytes);

    for (let offset = 0; offset < totalBytes; offset += maxBytesPerCall) {
      // subarray is a view on `data`, and clamps the end to the buffer length.
      crypto.getRandomValues(data.subarray(offset, offset + maxBytesPerCall));
    }

    return data;
  }

  /**
   * Generates small text files, each with a random upload delay.
   *
   * @param {number} [concurrency=10] - Number of files to generate.
   * @returns {Array<{name: string, blob: Blob, uploadDelay: number}>} Each
   *   `uploadDelay` is a random value in the range [0, 1000).
   */
  generateConcurrentUploadSet(concurrency = 10) {
    const files = [];

    for (let i = 0; i < concurrency; i++) {
      const content = `Concurrent upload test file ${i + 1}\n`.repeat(1000);
      files.push({
        name: `concurrent_${i + 1}.txt`,
        blob: new Blob([content], { type: "text/plain" }),
        uploadDelay: Math.random() * 1000, // Random delay 0-1000ms
      });
    }

    return files;
  }
}

Data-Driven Test Configuration

Modern Test Data Management

// test-data-config.js
// Image fixtures: encodable formats, malformed inputs and dimension extremes.
const imageDataSets = {
  valid: [
    { format: "webp", size: "small", quality: "high" },
    { format: "avif", size: "medium", quality: "medium" },
    { format: "jpeg", size: "large", quality: "low" },
  ],
  invalid: [
    { type: "corrupted_header" },
    { type: "oversized_metadata" },
    { type: "unsupported_format" },
  ],
  edge_cases: [
    { width: 1, height: 1 },
    { width: 65535, height: 65535 },
    { animated: true, frames: 100 },
  ],
};

// Document fixtures: regular office formats plus security-relevant variants.
const documentDataSets = {
  valid: [
    { type: "pdf", pages: 1, size: "A4" },
    { type: "docx", pages: 10, tables: true },
    { type: "xlsx", sheets: 3, rows: 1000 },
  ],
  security: [
    { type: "password_protected" },
    { type: "macro_enabled" },
    { type: "external_links" },
  ],
};

// Performance fixtures: bulk-load shapes and stress scenarios.
const performanceDataSets = {
  load_testing: [
    { file_count: 100, total_size_mb: 50 },
    { file_count: 1000, total_size_mb: 100 },
    { file_count: 10, total_size_mb: 500 },
  ],
  stress_testing: [
    { concurrent_uploads: 50 },
    { file_size_gb: 2 },
    { rapid_uploads: { count: 100, interval_ms: 10 } },
  ],
};

/**
 * Declarative catalogue of test cases, grouped as `<category>.<set>`.
 */
export const testDataSets = {
  images: imageDataSets,
  documents: documentDataSets,
  performance: performanceDataSets,
};

// Test data factory
/**
 * Turns declarative test cases from `testDataSets` into generated files by
 * dispatching to the individual generators.
 */
export class TestDataFactory {
  constructor() {
    this.imageGenerator = new SyntheticImageGenerator();
    this.documentGenerator = new TestDocumentGenerator();
    this.malformedGenerator = new MalformedFileGenerator();
    this.performanceGenerator = new PerformanceTestDataGenerator();
  }

  /**
   * Generates every test case in one configured set, one after another.
   *
   * @param {string} configPath - `"<category>.<set>"`, e.g. "images.valid".
   *   Only the first two dot-separated segments are used.
   * @returns {Promise<Array>} One generated result per test case, in order.
   * @throws {Error} If the path does not name a test set, or a test case has
   *   a type/format the factory cannot generate.
   */
  async generateFromConfig(configPath) {
    const [category, setName] = String(configPath).split(".");
    const testSet = testDataSets[category]?.[setName];

    // Fail with the offending path instead of an opaque TypeError.
    if (!Array.isArray(testSet)) {
      throw new Error(`Unknown test data config: ${configPath}`);
    }

    const results = [];

    for (const testCase of testSet) {
      results.push(await this.generateTestCase(testCase));
    }

    return results;
  }

  /**
   * Generates a single test case, selected by `type` or, failing that,
   * `format`.
   *
   * @param {object} testCase - One entry of a test set.
   * @returns {Promise<*>} Whatever the selected generator returns.
   * @throws {Error} If neither `type` nor `format` is supported.
   */
  async generateTestCase(testCase) {
    const kind = testCase.type || testCase.format;

    switch (kind) {
      case "webp":
      case "avif":
      case "jpeg":
        return this.generateImageTestCase(testCase);
      case "pdf":
        return this.generateDocumentTestCase(testCase);
      case "corrupted_header":
        return this.malformedGenerator.generateCorruptedImage();
      case "oversized_metadata":
        return this.malformedGenerator.generateOversizedMetadata();
      default:
        // Report the value actually dispatched on, which may be a format.
        throw new Error(`Unknown test case type: ${kind}`);
    }
  }

  /**
   * Generates a square image from symbolic size and quality names.
   *
   * @param {object} config
   * @param {string} config.format - Image format passed to the generator.
   * @param {string} [config.size] - "small", "medium" or "large"; anything
   *   else falls back to 1200 pixels.
   * @param {string} [config.quality] - "low", "medium" or "high"; anything
   *   else falls back to 0.8.
   * @returns {Promise<*>} The image generator's result.
   */
  async generateImageTestCase(config) {
    const sizeMap = { small: 400, medium: 1200, large: 3000 };
    const qualityMap = { low: 0.3, medium: 0.7, high: 0.9 };
    const edge = sizeMap[config.size] || 1200;

    return this.imageGenerator.generateTestImage({
      width: edge,
      height: edge,
      format: config.format,
      quality: qualityMap[config.quality] || 0.8,
    });
  }

  /**
   * Generates a PDF with the configured page count and paper size.
   *
   * @param {object} config
   * @param {number} [config.pages] - Passed through unchanged.
   * @param {string} [config.size] - Paper size; falls back to "A4".
   * @returns {*} The document generator's result.
   */
  generateDocumentTestCase(config) {
    return this.documentGenerator.generatePDF({
      pages: config.pages,
      size: config.size || "A4",
    });
  }
}

Advanced Testing Scenarios

1. Automated Test Data Pipeline

// test-data-pipeline.js
/**
 * Generates a suite of test files from named scenarios and keeps the
 * resulting Blobs in memory for later lookup by id.
 */
export class TestDataPipeline {
  constructor() {
    this.factory = new TestDataFactory();
    this.storage = new Map();
  }

  /**
   * Generates files for each scenario and collects them into one suite.
   *
   * The factory may return either bare Blobs or `{ name, blob, metadata }`
   * descriptors; both are accepted. Files without a name are named
   * `<scenario>_<index>.<extension>`, where index is the file's position in
   * the whole suite.
   *
   * @param {Array<{name: string, config: string, extension: string}>} scenarios
   * @returns {Promise<{metadata: object, files: Array<object>}>}
   * @throws {Error} If a generated entry carries no Blob.
   */
  async generateTestSuite(scenarios) {
    const testSuite = {
      metadata: {
        generated_at: new Date().toISOString(),
        total_files: 0,
        total_size_mb: 0,
      },
      files: [],
    };

    for (const scenario of scenarios) {
      console.log(`Generating test data for: ${scenario.name}`);

      const files = await this.factory.generateFromConfig(scenario.config);

      for (const generated of files) {
        // Bare Blobs (and Files, which carry a name) are wrapped so that
        // both result shapes are handled by the same code below.
        const file = generated instanceof Blob ? { blob: generated, name: generated.name } : generated;

        if (!(file?.blob instanceof Blob)) {
          throw new Error(`Scenario "${scenario.name}" produced an entry without a Blob`);
        }

        const testFile = {
          id: crypto.randomUUID(),
          name: file.name || `${scenario.name}_${testSuite.files.length}.${scenario.extension}`,
          blob: file.blob,
          size: file.blob.size,
          type: file.blob.type,
          scenario: scenario.name,
          metadata: file.metadata || {},
        };

        testSuite.files.push(testFile);
        testSuite.metadata.total_files++;
        testSuite.metadata.total_size_mb += testFile.size / (1024 * 1024);
      }
    }

    return testSuite;
  }

  /**
   * Logs a manifest of the suite (everything except the Blobs) and stores
   * each Blob in memory under its file id.
   *
   * @param {{metadata: object, files: Array<object>}} testSuite
   * @param {string} outputPath - Currently unused; nothing is written to disk.
   * @returns {Promise<void>}
   */
  async saveTestSuite(testSuite, outputPath) {
    const manifest = {
      metadata: testSuite.metadata,
      files: testSuite.files.map(({ id, name, size, type, scenario, metadata }) => ({
        id,
        name,
        size,
        type,
        scenario,
        metadata,
      })),
    };

    // In a real implementation, you'd save to filesystem or cloud storage
    console.log("Test suite manifest:", manifest);

    // Store files for retrieval
    for (const file of testSuite.files) {
      this.storage.set(file.id, file.blob);
    }
  }

  /**
   * Looks up a Blob stored by `saveTestSuite`.
   *
   * @param {string} fileId
   * @returns {Blob|undefined} The stored Blob, or undefined if unknown.
   */
  getTestFile(fileId) {
    return this.storage.get(fileId);
  }
}

// Usage example
// Each row is [scenario name, "<category>.<set>" config path, fallback file extension].
const scenarios = [
  ["standard_images", "images.valid", "webp"],
  ["security_tests", "images.invalid", "png"],
  ["performance_load", "performance.load_testing", "bin"],
].map(([name, config, extension]) => ({ name, config, extension }));

// Generate the suite, then log its manifest and keep the Blobs in memory.
const pipeline = new TestDataPipeline();
const testSuite = await pipeline.generateTestSuite(scenarios);
await pipeline.saveTestSuite(testSuite, "./test-data");

2. Test Data Validation

// test-data-validator.js
/**
 * Checks a generated test suite for structural problems and gathers
 * statistics about its files.
 */
export class TestDataValidator {
  /**
   * Validates every file in the suite and aggregates statistics.
   *
   * @param {{files: Array<object>}} testSuite
   * @returns {{valid: boolean, issues: string[], statistics: object}}
   */
  validateTestSuite(testSuite) {
    const validationReport = {
      valid: true,
      issues: [],
      statistics: {
        total_files: testSuite.files.length,
        total_size_mb: 0,
        formats: {},
        scenarios: {},
      },
    };

    for (const file of testSuite.files) {
      this.validateFile(file, validationReport);
      this.updateStatistics(file, validationReport.statistics);
    }

    return validationReport;
  }

  /**
   * Appends any problems found in `file` to `report.issues` and clears
   * `report.valid` when at least one is found.
   *
   * @param {object} file - Suite entry to check.
   * @param {{valid: boolean, issues: string[]}} report - Mutated in place.
   */
  validateFile(file, report) {
    // Check required properties
    const required = ["id", "name", "blob", "size", "type", "scenario"];
    for (const prop of required) {
      const value = file[prop];
      // Only absent values and empty strings count as missing. A plain
      // falsy check would also reject `size: 0`, i.e. every empty file.
      if (value === undefined || value === null || value === "") {
        report.issues.push(`File missing required property: ${prop}`);
        report.valid = false;
      }
    }

    // Validate file size consistency
    if (file.blob && file.size !== file.blob.size) {
      report.issues.push(`Size mismatch for ${file.name}: reported ${file.size}, actual ${file.blob.size}`);
      report.valid = false;
    }

    // Validate file name
    if (file.name && !this.isValidFileName(file.name)) {
      report.issues.push(`Invalid file name: ${file.name}`);
      report.valid = false;
    }
  }

  /**
   * Adds one file to the running totals and per-format/per-scenario counts.
   *
   * @param {object} file - Suite entry to count.
   * @param {object} stats - The report's `statistics` object, mutated in place.
   */
  updateStatistics(file, stats) {
    // A missing or non-numeric size must not turn the total into NaN.
    if (Number.isFinite(file.size)) {
      stats.total_size_mb += file.size / (1024 * 1024);
    }

    // Count formats
    const format = file.type || "unknown";
    stats.formats[format] = (stats.formats[format] || 0) + 1;

    // Count scenarios
    stats.scenarios[file.scenario] = (stats.scenarios[file.scenario] || 0) + 1;
  }

  /**
   * Basic filename validation.
   *
   * @param {string} filename
   * @returns {boolean} False if the name is longer than 255 characters or
   *   contains any of `< > : " / \ | ? *` or a control character
   *   (U+0000 to U+001F).
   */
  isValidFileName(filename) {
    const invalidChars = /[<>:"/\\|?*\x00-\x1f]/;
    return !invalidChars.test(filename) && filename.length <= 255;
  }
}

Integration with Modern Testing Frameworks

Vitest Integration

// __tests__/test-data.setup.js
import { beforeAll, afterAll } from "vitest";
import { TestDataPipeline } from "../src/test-utils/test-data-pipeline.js";

// Module-level handles assigned by the beforeAll hook below.
let testDataPipeline;
let testSuite;

beforeAll(async () => {
  testDataPipeline = new TestDataPipeline();

  // Each row is [scenario name, config path, fallback file extension].
  const scenarios = [
    ["unit_test_images", "images.valid", "webp"],
    ["edge_cases", "images.edge_cases", "png"],
  ].map(([name, config, extension]) => ({ name, config, extension }));

  testSuite = await testDataPipeline.generateTestSuite(scenarios);

  // Expose the generated suite to test files through a global.
  globalThis.testData = testSuite;
});

afterAll(() => {
  // Drop the global reference to the generated suite.
  globalThis.testData = null;
});

// __tests__/file-processing.test.js
import { describe, it, expect } from "vitest";
import { processFile } from "../src/file-processor.js";

// Exercises processFile against the suite that the setup file publishes on
// globalThis.testData.
describe("File Processing with Generated Test Data", () => {
  it("should process standard images correctly", async () => {
    const imageFiles = globalThis.testData.files.filter((f) => f.scenario === "unit_test_images");

    // Without this, an empty or mislabelled suite would pass with zero checks.
    expect(imageFiles.length).toBeGreaterThan(0);

    for (const file of imageFiles) {
      const result = await processFile(file.blob);

      expect(result.success).toBe(true);
      expect(result.format).toBeTruthy();
      expect(result.dimensions).toBeDefined();
    }
  });

  it("should handle edge cases gracefully", async () => {
    const edgeCaseFiles = globalThis.testData.files.filter((f) => f.scenario === "edge_cases");

    // Without this, an empty or mislabelled suite would pass with zero checks.
    expect(edgeCaseFiles.length).toBeGreaterThan(0);

    for (const file of edgeCaseFiles) {
      const result = await processFile(file.blob);

      // Edge cases might fail, but should fail gracefully
      expect(result).toBeDefined();

      if (!result.success) {
        expect(result.error).toBeDefined();
        expect(typeof result.error).toBe("string");
      }
    }
  });
});

Best Practices for 2025

1. Automated Generation

Use synthetic data generation instead of static test files
Generate test data on demand to reduce repository size
Create parameterized test data for different scenarios

2. Comprehensive Coverage

Include edge cases (empty files, huge files, malformed data)
Test security scenarios (malicious files, injection attempts)
Generate performance test datasets automatically

3. Modern Formats

Focus on WebP, AVIF, and other modern image formats
Test new video codecs (AV1, VP9)
Include modern document formats and standards

4. Realistic Data

Generate data that matches real-world file characteristics
Include metadata patterns found in production
Simulate user behavior patterns in file uploads

5. Scalable Testing

Design test data generation to work in CI/CD pipelines
Use streaming and chunking for large file testing
Implement parallel test data generation

Creating comprehensive test data sets is essential for robust file processing systems. Modern approaches using synthetic generation, automated pipelines, and integration with testing frameworks ensure thorough validation while maintaining efficiency and security in 2025 development workflows.

FileMock