# --- Cell: environment variables -------------------------------------------
# These are written to os.environ before torch/transformers are imported in
# the next cell.
# NOTE(review): the notebook text says these disable BF16/FP16 precision
# reductions; confirm that each variable name is actually read by the
# installed CUDA / PyTorch / Accelerate versions.
import os
os.environ["CUDA_DISABLE_BF16"] = "1"
os.environ["TORCH_CUDA_ALLOW_BF16_REDUCED_PRECISION_REDUCTION"] = "0"
os.environ["ACCELERATE_DISABLE_FP16"] = "1"

# --- Cell: imports and logging ---------------------------------------------
import json
import logging
import os
from pathlib import Path

from docx import Document
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Root logger at INFO with a "timestamp - level - message" format; `logger` is
# the module-level logger used by every later cell.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Cell: directory layout ------------------------------------------------
# Everything this notebook writes goes under ./build/training_prep:
#   data/   - generated JSONL training pairs
#   models/ - created here; not written to by any cell visible in this notebook
OUTPUT_DIR = Path("./build/training_prep")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
DATA_DIR = OUTPUT_DIR / "data"
DATA_DIR.mkdir(exist_ok=True)
MODELS_DIR = OUTPUT_DIR / "models"
MODELS_DIR.mkdir(exist_ok=True)

# Local directory passed as cache_dir= when the tokenizer and model are loaded.
MODEL_CACHE_DIR = Path("./model/base-model")
MODEL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
# NOTE(review): HF_HOME is set after `transformers` has already been imported
# above; confirm the library still honours it at this point. The later
# from_pretrained() calls pass cache_dir explicitly, so they do not rely on it.
os.environ["HF_HOME"] = str(MODEL_CACHE_DIR)

logger.info(f"Output directory: {OUTPUT_DIR}")
logger.info(f"Model cache directory: {MODEL_CACHE_DIR}")
def load_docx_file(file_path: str) -> list:
    """Load a DOCX file and return its non-empty paragraphs.

    Args:
        file_path: Path to the ``.docx`` document.

    Returns:
        Paragraph strings in document order, stripped of surrounding
        whitespace. Paragraphs that are empty after stripping are omitted.
    """
    logger.info(f"Loading DOCX file: {file_path}")
    doc = Document(file_path)
    # Strip every paragraph exactly once, then drop the ones that end up empty.
    # NOTE(review): only `doc.paragraphs` is read; text stored elsewhere in the
    # document (e.g. tables) is presumably not included - confirm if needed.
    stripped = (p.text.strip() for p in doc.paragraphs)
    paragraphs = [text for text in stripped if text]
    logger.info(f"Extracted {len(paragraphs)} paragraphs from {file_path}")
    return paragraphs


def load_json_file(file_path: str) -> list:
    """Load a JSON file and return its content as a list of items.

    Args:
        file_path: Path to the ``.json`` file.

    Returns:
        The parsed array as-is, a one-element list when the top-level value is
        an object, or an empty list (with a warning) for any other top-level
        value such as a bare string or number.

    Raises:
        json.JSONDecodeError: If the file is not valid JSON.
    """
    logger.info(f"Loading JSON file: {file_path}")
    # 'utf-8-sig' decodes plain UTF-8 identically but also strips a leading
    # BOM, which json.load() rejects when the file is opened as plain 'utf-8'.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        data = json.load(f)
    if isinstance(data, list):
        logger.info(f"Loaded {len(data)} items from JSON file")
        return data
    if isinstance(data, dict):
        logger.info(f"JSON file is dict, converting to list")
        return [data]
    # Previously this case returned [] without any trace of why.
    logger.warning(
        f"JSON file {file_path} holds a top-level {type(data).__name__}, "
        "not an array or object; no items loaded"
    )
    return []


def load_jsonl_file(file_path: str) -> list:
    """Load a JSONL file (one JSON value per non-blank line).

    Args:
        file_path: Path to the ``.jsonl`` file.

    Returns:
        The parsed values in file order; blank lines are skipped.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON. The message names
            the 1-based line number and the file so the bad row can be found.
    """
    logger.info(f"Loading JSONL file: {file_path}")
    items = []
    # 'utf-8-sig' so that a BOM in front of the first record does not break it.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                items.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type as before, with the file position added.
                raise json.JSONDecodeError(
                    f"{exc.msg} (line {line_no} of {file_path})", exc.doc, exc.pos
                ) from exc
    logger.info(f"Loaded {len(items)} items from JSONL file")
    return items


def load_training_file(file_path: str) -> list:
    """Load a training file, choosing the loader from the file extension.

    Args:
        file_path: Path to a ``.docx``, ``.json`` or ``.jsonl`` file; the
            extension is matched case-insensitively.

    Returns:
        The list produced by the matching loader.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    ext = Path(file_path).suffix.lower()
    loaders = {
        '.docx': load_docx_file,
        '.json': load_json_file,
        '.jsonl': load_jsonl_file,
    }
    loader = loaders.get(ext)
    if loader is None:
        raise ValueError(f"Unsupported file format: {ext}")
    return loader(file_path)
# Hard requirement: this cell aborts when no CUDA device is visible.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA not available. Please run in a GPU environment.")

logger.info(f"Using GPU: {torch.cuda.get_device_name(0)}")

# Hugging Face model id used for both the tokenizer and the model below.
BASE_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"

logger.info(f"Loading base model: {BASE_MODEL}")
# Files are cached under MODEL_CACHE_DIR; local_files_only=False allows a
# download when they are not cached yet.
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL,
    cache_dir=str(MODEL_CACHE_DIR),
    local_files_only=False,
)
# When the tokenizer defines no pad token, reuse the EOS token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the weights 4-bit quantized via bitsandbytes with float16 compute;
# device_map="auto" leaves device placement to the library.
# NOTE(review): `dtype=` is passed here rather than `torch_dtype=`; confirm the
# installed transformers version accepts this keyword.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    cache_dir=str(MODEL_CACHE_DIR),
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16
    ),
    device_map="auto",
    dtype=torch.float16,
)

logger.info("Model loaded successfully")
# Path of the input document. load_training_file() picks the parser from the
# extension (.docx, .json or .jsonl).
TRAINING_FILE = "./model/data/data.docx"
training_data = load_training_file(TRAINING_FILE)
logger.info(f"Loaded {len(training_data)} items from training file")

# Quick preview of what was loaded. An input without usable content yields an
# empty list, so guard the [0] access instead of failing with IndexError.
print(f"Loaded {len(training_data)} items")
if training_data:
    first_item = training_data[0]
    print(f"First item type: {type(first_item)}")
    print(f"First item (first 200 chars): {str(first_item)[:200]}")
    if isinstance(first_item, dict):
        print(f"First item keys: {list(first_item.keys())}")
else:
    print(f"No items were loaded from {TRAINING_FILE}; nothing to preview.")
def format_training_sample(sample) -> str:
    """Render one loaded training item as a single piece of prompt text.

    Args:
        sample: A dict (JSON/JSONL row), a string (DOCX paragraph) or any
            other object.

    Returns:
        For a dict, its non-empty string values joined as
        ``"key: value | key: value"`` (non-string values are skipped), or the
        dict's JSON dump when it has no such values. A string is returned
        stripped. Anything else is converted with ``str()``.
    """
    try:
        if isinstance(sample, dict):
            parts = [
                f"{key}: {value.strip()}"
                for key, value in sample.items()
                if isinstance(value, str) and value.strip()
            ]
            return " | ".join(parts) if parts else json.dumps(sample)
        if isinstance(sample, str):
            return sample.strip()
        return str(sample)
    except Exception:
        # Best effort, e.g. json.dumps() on a dict that is not serialisable.
        return str(sample)


def get_optimal_batch_size() -> int:
    """Pick how many items go into one prompt from the GPU's total memory.

    Returns:
        5 without CUDA or when the memory query fails; otherwise 20 / 15 / 12 /
        8 / 5 for devices with at least 24 / 16 / 12 / 8 / under 8 GiB.
    """
    if not torch.cuda.is_available():
        return 5

    try:
        gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        logger.info(f"GPU total memory: {gpu_mem:.2f} GB")

        # (minimum GiB, items per batch), checked from the largest tier down.
        for min_gib, size in ((24, 20), (16, 15), (12, 12), (8, 8)):
            if gpu_mem >= min_gib:
                return size
        return 5
    except Exception as e:
        logger.warning(f"Could not determine GPU memory: {e}. Using conservative batch size.")
        return 5


def build_generation_prompt(data_block: str, chat_tokenizer) -> tuple:
    """Build the prompt for one batch, pre-filled with the opening '['.

    Args:
        data_block: The numbered items of the batch, one per line.
        chat_tokenizer: Tokenizer of the generating model.

    Returns:
        ``(prompt, add_special_tokens)``. When the tokenizer ships a chat
        template, the prompt is rendered with it so the turn markers match the
        loaded model, and ``add_special_tokens`` is False because the rendered
        text already contains the special tokens. Otherwise the hard-coded
        ChatML layout is used and ``add_special_tokens`` is True.
    """
    system_prompt = (
        "You are a JSON generator. Your task is to read content and output ONLY a valid JSON array.\n"
        "Each object must have exactly two fields: 'instruction' and 'response'.\n"
        "Do not include any text before or after the JSON array.\n"
        "The instruction field should be a question or task from the content.\n"
        "The response field should be the answer extracted from the content.\n"
        "Output MUST be valid JSON - nothing else."
    )
    user_prompt = (
        f"Content to extract training pairs from:\n{data_block}\n\n"
        "Output a JSON array with instruction-response pairs. Output ONLY the JSON array, no other text:"
    )

    if getattr(chat_tokenizer, "chat_template", None):
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        rendered = chat_tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # The trailing "[" nudges the model to continue a JSON array.
        return rendered + "[", False

    # Fallback for tokenizers without a chat template: ChatML-style markers.
    prompt = (
        f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
        f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
        "<|im_start|>assistant\n["
    )
    return prompt, True


def extract_json_array(text: str):
    """Return the first bracket-balanced JSON array in ``text``, or None.

    Brackets inside double-quoted strings are ignored, and a backslash inside
    a string escapes the following character. None is returned when ``text``
    has no '[' or the array is never closed (e.g. truncated output).
    """
    start = text.find("[")
    if start == -1:
        return None

    depth = 0
    in_string = False
    escaped = False
    for idx in range(start, len(text)):
        ch = text[idx]
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch == "[":
            depth += 1
        elif ch == "]":
            depth -= 1
            if depth == 0:
                return text[start: idx + 1]
    return None


def strip_trailing_commas(json_text: str) -> str:
    """Remove commas that directly precede a closing ']' or '}'.

    Whitespace between the comma and the bracket is tolerated. The pattern is
    applied to the raw text, so it would also touch such a sequence inside a
    string literal; it is therefore only used after strict parsing has failed.
    """
    import re  # local import: only needed on the repair path

    return re.sub(r",\s*([\]}])", r"\1", json_text)


def pairs_from_json(json_text: str) -> list:
    """Parse model output into ``(instruction, response)`` tuples.

    Nested arrays are flattened (the model sometimes repeats the pre-filled
    '['), elements that are not JSON objects are skipped, and an object is
    kept only when both fields are present, non-null and non-blank.

    Raises:
        json.JSONDecodeError: If ``json_text`` is not valid JSON.
    """
    def _objects(node):
        # Yield every dict found in arbitrarily nested lists, in order.
        if isinstance(node, dict):
            yield node
        elif isinstance(node, list):
            for child in node:
                yield from _objects(child)

    def _clean(value) -> str:
        # JSON null must count as "missing", not become the text "None".
        return "" if value is None else str(value).strip()

    pairs = []
    for item in _objects(json.loads(json_text)):
        instr = _clean(item.get("instruction"))
        resp = _clean(item.get("response"))
        if instr and resp:
            pairs.append((instr, resp))
    return pairs


def generate_pairs_with_model(training_data: list, batch_size: int = None, max_tokens: int = 2048) -> list:
    """
    Use the model to generate instruction/response pairs from training data.
    Processes data in batches to fit within GPU memory constraints.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects. Sampling
    is enabled, so results differ between runs unless a seed is set beforehand.

    Args:
        training_data: List of training items to process
        batch_size: Number of items per batch (None = auto-detect based on GPU memory)
        max_tokens: Maximum tokens to generate per batch (default: 2048)

    Returns:
        List of ``(instruction, response)`` string tuples from all batches.
        A batch whose output cannot be parsed, or that runs out of GPU memory,
        is logged and skipped.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: Any non-OOM runtime error raised during generation.
    """
    if batch_size is None:
        batch_size = get_optimal_batch_size()
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    logger.info(f"Generating training pairs from {len(training_data)} items")
    logger.info(f"Batch size: {batch_size}, Max tokens per batch: {max_tokens}")

    all_pairs = []
    total_batches = (len(training_data) + batch_size - 1) // batch_size

    for batch_num, start in enumerate(range(0, len(training_data), batch_size), start=1):
        batch = training_data[start:start + batch_size]
        logger.info(f"Processing batch {batch_num}/{total_batches} ({len(batch)} items)")

        data_block = "\n".join(
            f"{position}. {format_training_sample(item)}"
            for position, item in enumerate(batch, start=1)
        )
        prompt, add_special_tokens = build_generation_prompt(data_block, tokenizer)

        try:
            inputs = tokenizer(
                prompt, return_tensors="pt", add_special_tokens=add_special_tokens
            ).to(model.device)

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            with torch.no_grad():
                output = model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.95,
                    top_k=50,
                    eos_token_id=tokenizer.eos_token_id,
                )
        except RuntimeError as e:
            if "out of memory" in str(e).lower():
                logger.error(f"OOM in batch {batch_num}. Try reducing batch_size or max_tokens.")
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                continue
            raise

        # Keep only the newly generated tokens, not the echoed prompt.
        input_length = inputs.input_ids.shape[1]
        decoded = tokenizer.decode(output[0][input_length:], skip_special_tokens=True)
        logger.debug(f"Model output (first 300 chars): {decoded[:300]}")

        # The prompt ended with "[", so put it back in front of the continuation.
        json_text = extract_json_array("[" + decoded)
        if json_text is None:
            logger.warning(f"Failed to find JSON array boundary in batch {batch_num}")
            continue

        try:
            batch_pairs = pairs_from_json(json_text)
            logger.info(f"Extracted {len(batch_pairs)} pairs from batch {batch_num}")
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse JSON in batch {batch_num}: {str(e)}")
            try:
                batch_pairs = pairs_from_json(strip_trailing_commas(json_text))
                logger.info(f"Fixed JSON and extracted {len(batch_pairs)} pairs from batch {batch_num}")
            except Exception as e2:
                logger.error(f"Could not fix JSON in batch {batch_num}: {str(e2)}")
                continue
        except Exception as e:
            logger.error(f"Unexpected error parsing batch {batch_num}: {str(e)}")
            continue

        all_pairs.extend(batch_pairs)

    logger.info(f"Total pairs generated: {len(all_pairs)}")
    return all_pairs
# --- Cell: save the generated pairs as JSONL --------------------------------
output_file = DATA_DIR / "generated_training_pairs.jsonl"

# An empty result still produces a (zero-line) file; say so explicitly instead
# of reporting plain success.
if not training_pairs:
    logger.warning(f"No training pairs were generated; {output_file} will be empty")

logger.info(f"Saving {len(training_pairs)} pairs to {output_file}")

# One JSON object per line. The response is stored under the key "output";
# ensure_ascii=False keeps non-ASCII text readable in the file.
with open(output_file, 'w', encoding='utf-8') as f:
    for instruction, response in training_pairs:
        training_pair = {
            "instruction": instruction,
            "output": response,
        }
        f.write(json.dumps(training_pair, ensure_ascii=False) + "\n")

logger.info(f"Training data saved to {output_file}")
print(f"\n✓ Training pairs saved to: {output_file}")

# --- Cell: cleanup -----------------------------------------------------------
import gc

# Drop the notebook-level references to the model and tokenizer. pop() is used
# instead of `del` so that re-running this cell (or running it after a failed
# load) does not raise NameError.
for _name in ("model", "tokenizer"):
    globals().pop(_name, None)
gc.collect()

if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

logger.info("GPU memory freed")