// Provenance (commit header captured with the file; not part of the program):
// github-actions[bot]
// Deploy demo from GitHub Actions - 2025-12-24 02:23:20
// 6cdce85
import type { DatasetExample, TaskType, Category, CodingProblem } from '@/types';
/**
 * Image payload attached to a raw dataset row.
 * Only `src` is read in this file (it becomes the example's `imageUrl`).
 */
interface HFImage {
// URL of the image; copied to `imageUrl` when non-empty.
src: string;
// Presumably the image height in pixels — not read in this file; confirm against the API.
height: number;
// Presumably the image width in pixels — not read in this file; confirm against the API.
width: number;
}
/**
 * Raw row shape expected from the datasets-server `/rows` response.
 * `fetchBatch` maps each row of this shape into a `DatasetExample`.
 */
interface HFDatasetRow {
question: string;
answer: string;
// Cast to `TaskType` without runtime validation when mapped.
type: string;
// Cast to `Category` without runtime validation when mapped.
category: string;
// `null` when the row carries no image.
image: HFImage | null;
// Mapped to `testCode`; `null` or empty string becomes `undefined`.
test_code: string | null;
// Mapped to `entryPoint`; `null` or empty string becomes `undefined`.
entry_point: string | null;
source: string;
}
/**
 * Subset of the `/rows` response body that this file reads.
 */
interface HFDatasetResponse {
// One entry per returned row; `row_idx` is combined with the split name to build the example id.
rows: Array<{ row: HFDatasetRow; row_idx: number }>;
// Used by the loader as the total row count of the split when paging through batches.
num_rows_total: number;
}
/**
 * Per-split metadata entry inside the `/info` response; only the example count is read.
 */
interface HFSplitInfo {
num_examples: number;
}
/**
 * Subset of the `/info` response body that this file reads.
 * Every level is optional, so readers must tolerate missing sections.
 */
interface HFDatasetInfo {
dataset_info?: {
// Metadata for the `default` config, the only config this file queries.
default?: {
// Split name -> split metadata.
splits?: Record<string, HFSplitInfo>;
};
};
}
/**
 * Result of a filtered, paginated query over locally loaded examples.
 */
export interface LoadExamplesResult {
// The requested page of matching examples.
examples: DatasetExample[];
// Number of examples matching the filters before pagination is applied.
total: number;
}
/**
 * Optional criteria accepted by `DatasetLoader.filterExamples`.
 * Criteria that are omitted (or falsy, for `type`, `category`, `search`) are not applied.
 */
export interface FilterOptions {
// Keep only examples whose `type` equals this value.
type?: TaskType;
// Keep only examples whose `category` equals this value.
category?: Category;
// When defined, keep only examples whose `hasImage` flag equals this value.
hasImage?: boolean;
// Case-insensitive substring matched against the question or the answer.
search?: string;
// When true, start from the coding problems only instead of all examples.
codingOnly?: boolean;
}
// Base URL of the Hugging Face datasets-server REST API.
const HF_DATASET_API = 'https://datasets-server.huggingface.co';
// Dataset repository queried by every request in this file.
const DATASET_ID = 'samuellimabraz/quantum-assistant';
// Rows requested per `/rows` call.
// NOTE(review): presumably matches the server-side cap on `length` — confirm against the API docs.
const MAX_FETCH_LIMIT = 100;
/**
 * Loads dataset splits from the Hugging Face datasets-server and answers
 * filter/pagination queries from an in-memory cache.
 *
 * A split must be preloaded with `preloadSplit` before the synchronous
 * accessors (`getAllExamples`, `getCodingProblems`, `filterExamples`) return
 * anything other than empty results.
 */
export class DatasetLoader {
  /** Fully loaded examples, keyed by split name. */
  private splitData: Map<string, DatasetExample[]> = new Map();

  /**
   * Per-split counts reported by the `/info` endpoint, or `null` until a
   * successful, non-empty response has been cached. Counts of locally loaded
   * splits are deliberately NOT stored here; otherwise a single preloaded
   * split would masquerade as the complete split listing.
   */
  private splitInfo: Record<string, number> | null = null;

  /** In-flight preload promises, keyed by split name. */
  private isLoading: Map<string, Promise<void>> = new Map();

  /**
   * Preload all examples for a split (fetches all data at once).
   *
   * Resolves immediately when the split is already cached. Concurrent calls
   * for the same split share one in-flight request.
   *
   * @throws Error when any batch request returns a non-OK status.
   */
  async preloadSplit(split: 'train' | 'validation' | 'test'): Promise<void> {
    if (this.splitData.has(split)) {
      return;
    }

    // Prevent duplicate loading: reuse the pending promise if there is one.
    const inFlight = this.isLoading.get(split);
    if (inFlight) {
      return inFlight;
    }

    const loadPromise = this.fetchAllExamples(split);
    this.isLoading.set(split, loadPromise);
    try {
      await loadPromise;
    } finally {
      // Clear the marker on failure too, so a later call can retry.
      this.isLoading.delete(split);
    }
  }

  /**
   * Fetch every row of a split, batch by batch, and store the result in the
   * cache. Nothing is cached if any batch fails.
   */
  private async fetchAllExamples(split: 'train' | 'validation' | 'test'): Promise<void> {
    // The first request also tells us the total row count.
    const firstBatch = await this.fetchBatch(split, 0, MAX_FETCH_LIMIT);
    const allExamples: DatasetExample[] = [...firstBatch.examples];
    const total = firstBatch.total;
    let offset = allExamples.length;

    while (offset < total) {
      const batch = await this.fetchBatch(split, offset, MAX_FETCH_LIMIT);
      allExamples.push(...batch.examples);
      offset += batch.examples.length;
      // A short (or empty) batch means the server has no more rows; stopping
      // here also guarantees the loop terminates if `total` is overstated.
      if (batch.examples.length < MAX_FETCH_LIMIT) break;
    }

    this.splitData.set(split, allExamples);
  }

  /**
   * Fetch one page of rows and map them to `DatasetExample` objects.
   *
   * @returns The mapped examples plus the split's total row count as reported
   *          by the server.
   * @throws Error when the response status is not OK.
   */
  private async fetchBatch(
    split: string,
    offset: number,
    limit: number
  ): Promise<{ examples: DatasetExample[]; total: number }> {
    const url =
      `${HF_DATASET_API}/rows?dataset=${encodeURIComponent(DATASET_ID)}` +
      `&config=default&split=${encodeURIComponent(split)}&offset=${offset}&length=${limit}`;

    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Failed to load dataset: ${response.status}`);
    }

    const data: HFDatasetResponse = await response.json();
    const examples: DatasetExample[] = data.rows.map((item) => {
      const row = item.row;
      return {
        id: `${split}-${item.row_idx}`,
        question: row.question,
        answer: row.answer,
        type: row.type as TaskType,
        category: row.category as Category,
        imageUrl: row.image?.src || undefined,
        // `!= null` also treats a missing `image` field as "no image".
        hasImage: row.image != null,
        testCode: row.test_code || undefined,
        entryPoint: row.entry_point || undefined,
        source: row.source,
      };
    });

    return { examples, total: data.num_rows_total };
  }

  /**
   * Check if a split is loaded.
   */
  isLoaded(split: 'train' | 'validation' | 'test'): boolean {
    return this.splitData.has(split);
  }

  /**
   * Whether a preload for the split is currently in flight (for UI feedback).
   */
  isCurrentlyLoading(split: 'train' | 'validation' | 'test'): boolean {
    return this.isLoading.has(split);
  }

  /**
   * Get all examples for a split (must be preloaded first).
   *
   * @returns The cached array itself (not a copy), or an empty array when the
   *          split has not been loaded.
   */
  getAllExamples(split: 'train' | 'validation' | 'test'): DatasetExample[] {
    return this.splitData.get(split) ?? [];
  }

  /**
   * Get coding problems from loaded data: examples that have both test code
   * and an entry point and whose type is `function_completion` or
   * `code_generation`.
   */
  getCodingProblems(split: 'train' | 'validation' | 'test'): CodingProblem[] {
    return this.getAllExamples(split).filter(
      (e): e is CodingProblem =>
        e.testCode !== undefined &&
        e.entryPoint !== undefined &&
        (e.type === 'function_completion' || e.type === 'code_generation')
    );
  }

  /**
   * Filter and paginate locally loaded data.
   *
   * @param split   Split to query; yields an empty result if not preloaded.
   * @param filters Criteria to apply; omitted criteria are ignored.
   * @param limit   Maximum page size; negative values are treated as 0.
   * @param offset  Index of the first match to return; negative values are
   *                treated as 0.
   * @returns The requested page and the number of matches before pagination.
   */
  filterExamples(
    split: 'train' | 'validation' | 'test',
    filters: FilterOptions,
    limit: number = 50,
    offset: number = 0
  ): LoadExamplesResult {
    let examples: DatasetExample[] = filters.codingOnly
      ? this.getCodingProblems(split)
      : this.getAllExamples(split);

    if (filters.type) {
      examples = examples.filter((e) => e.type === filters.type);
    }
    if (filters.category) {
      examples = examples.filter((e) => e.category === filters.category);
    }
    if (filters.hasImage !== undefined) {
      examples = examples.filter((e) => e.hasImage === filters.hasImage);
    }
    if (filters.search) {
      const searchLower = filters.search.toLowerCase();
      examples = examples.filter(
        (e) =>
          e.question.toLowerCase().includes(searchLower) ||
          e.answer.toLowerCase().includes(searchLower)
      );
    }

    const total = examples.length;
    // `slice` interprets negative arguments as end-relative indices, which
    // would silently return the wrong page; clamp both values instead.
    const start = Math.max(0, offset);
    const pageSize = Math.max(0, limit);
    return { examples: examples.slice(start, start + pageSize), total };
  }

  /**
   * Get the example count of each split.
   *
   * Counts come from the `/info` endpoint (cached after the first successful,
   * non-empty response). When the request fails, hard-coded fallback counts
   * are returned and nothing is cached, so a later call retries. In every
   * case, splits that are already loaded locally report their actual loaded
   * length.
   *
   * @returns A fresh object mapping split name to example count.
   */
  async getSplitInfo(): Promise<Record<string, number>> {
    if (this.splitInfo !== null) {
      return this.withLoadedCounts(this.splitInfo);
    }

    const fallback: Record<string, number> = { train: 8366, validation: 1247, test: 1291 };
    const url = `${HF_DATASET_API}/info?dataset=${encodeURIComponent(DATASET_ID)}`;

    try {
      const response = await fetch(url);
      if (!response.ok) {
        return this.withLoadedCounts(fallback);
      }

      const data: HFDatasetInfo = await response.json();
      const splits = data.dataset_info?.default?.splits ?? {};
      const remote: Record<string, number> = {};
      for (const [name, info] of Object.entries(splits)) {
        remote[name] = info?.num_examples ?? 0;
      }

      // Only cache a non-empty listing so an empty answer is retried later.
      if (Object.keys(remote).length > 0) {
        this.splitInfo = remote;
      }
      return this.withLoadedCounts(remote);
    } catch {
      // Best effort: split counts are informational, so fall back rather
      // than surface a network/parse failure to the caller.
      return this.withLoadedCounts(fallback);
    }
  }

  /** Copy `base` and overlay the actual length of every locally loaded split. */
  private withLoadedCounts(base: Record<string, number>): Record<string, number> {
    const merged: Record<string, number> = { ...base };
    for (const [name, examples] of this.splitData) {
      merged[name] = examples.length;
    }
    return merged;
  }

  /**
   * Clear cached examples and cached split counts.
   *
   * In-flight preloads are not cancelled; a preload that resolves after this
   * call stores its split in the cache again.
   */
  clearCache(): void {
    this.splitData.clear();
    this.splitInfo = null;
  }
}
/** Shared module-level loader instance; importers of this binding share one cache. */
export const datasetLoader = new DatasetLoader();