""" cohelp_full.py — Minimal single-file script to: create tiny example dataset, fine-tune a causal LM (GPT-2 style), run a CLI chat, launch a Gradio demo, and upload to HF Hub. Features (all in one file): - small example dataset (jsonl) generated when needed - Trainer-based fine-tuning - a naive chat-friendly prompt formatting - simple loss-masking so only assistant tokens produce loss (naive implementation) - lightweight Gradio demo for interactive testing - upload_to_hub function to push model + tokenizer + model_card Caveats: - This is an educational minimal repo. For production, use larger datasets, handle tokenization / padding carefully, prefer LoRA/PEFT, and add safety filters. Usage examples: - Train: python cohelp_full.py --do_train --output_dir outputs/cohelp - Demo (local): python cohelp_full.py --do_demo --model outputs/cohelp - CLI chat: python cohelp_full.py --do_chat --model outputs/cohelp - Upload: python cohelp_full.py --upload --repo_id your-user/cohelp """ import os import argparse import json from pathlib import Path from typing import List import torch from datasets import load_dataset, Dataset from transformers import ( AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, ) # -------- CONFIGURATION -------- BASE_MODEL = "gpt2" # change to distilgpt2 or other causal model SPECIAL_TOKENS = { "bos_token": "<|bos|>", "eos_token": "<|eos|>", "pad_token": "<|pad|>", "additional_special_tokens": ["<|user|>", "<|assistant|>"] } DEFAULT_MAX_LENGTH = 512 # -------- Helpers: prompt formatting -------- def build_prompt(history: List[dict], user_input: str = None): """Build a prompt string from history and optional new user_input. history is a list of dicts like {"role":"user"/"assistant", "text":...} """ parts = [SPECIAL_TOKENS["bos_token"]] for turn in history: if turn["role"] == "user": parts.append("<|user|>") parts.append(turn["text"]) else: parts.append("<|assistant|>") parts.append(turn["text"]) if user_input is not None: parts.append("<|user|>") parts.append(user_input) parts.append("<|assistant|>") return " \n".join(parts) # -------- Tiny example dataset generator -------- EXAMPLE_JSONL = "cohelp_example.jsonl" EXAMPLE_LINES = [ {"role": "user", "text": "Hi, who are you?"}, histor