diff --git a/train_my_qlora.py b/train_my_qlora.py
new file mode 100644
index 0000000..2ab1e3f
--- /dev/null
+++ b/train_my_qlora.py
@@ -0,0 +1,87 @@
+import os
+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
+from transformers import DataCollatorForLanguageModeling, BitsAndBytesConfig
+from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
+
+# Configuration
+model_name = "mistralai/Mistral-7B-Instruct-v0.2"
+data_file_path = "../datasets/qa_dataset.json"  # <<== adjust to the name of your JSON file
+output_dir = "./qlora-output"
+max_length = 512
+
+# 4-bit quantization with bitsandbytes
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype="float16"
+)
+
+# Tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+tokenizer.pad_token = tokenizer.eos_token
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    quantization_config=bnb_config,
+    device_map="auto",
+    trust_remote_code=True
+)
+model = prepare_model_for_kbit_training(model)
+
+# LoRA adapter configuration
+peft_config = LoraConfig(
+    r=8,
+    lora_alpha=16,
+    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+    lora_dropout=0.05,
+    bias="none",
+    task_type="CAUSAL_LM"
+)
+model = get_peft_model(model, peft_config)
+
+# Local JSON dataset
+dataset = load_dataset("json", data_files=data_file_path, split="train")
+
+# Tokenization
+def tokenize(example):
+    tokenized = tokenizer(
+        example["text"],
+        truncation=True,
+        max_length=max_length,
+        padding="max_length"
+    )
+    tokenized["labels"] = tokenized["input_ids"].copy()  # causal LM: labels mirror input_ids
+    return tokenized
+
+tokenized_dataset = dataset.map(tokenize, remove_columns=dataset.column_names)
+
+# Data collator (mlm=False -> causal language modeling)
+data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+# Training arguments
+training_args = TrainingArguments(
+    output_dir=output_dir,
+    per_device_train_batch_size=2,
+    gradient_accumulation_steps=4,
+    num_train_epochs=3,
+    learning_rate=2e-4,
+    fp16=True,
+    logging_steps=10,
+    save_strategy="epoch",
+    report_to="none"
+)
+
+# Trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_dataset,
+    data_collator=data_collator
+)
+
+# Training
+trainer.train()
+model.save_pretrained(output_dir)
+tokenizer.save_pretrained(output_dir)
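
Not part of the patch itself, but as a usage note: a minimal sketch of loading the saved adapter for inference, assuming the adapter and tokenizer end up in ./qlora-output as written by the script above. The [INST] prompt below is only an illustration of the Mistral-Instruct chat convention; substitute your own prompt format.

    # Hypothetical inference sketch (not part of the patch).
    # Assumes the LoRA adapter and tokenizer were saved to ./qlora-output.
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
    from peft import PeftModel

    base_model = "mistralai/Mistral-7B-Instruct-v0.2"
    bnb = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    tokenizer = AutoTokenizer.from_pretrained("./qlora-output")
    model = AutoModelForCausalLM.from_pretrained(
        base_model, quantization_config=bnb, device_map="auto"
    )
    # Attach the trained LoRA adapter on top of the quantized base model
    model = PeftModel.from_pretrained(model, "./qlora-output")

    prompt = "[INST] Your question here [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=128)
    print(tokenizer.decode(output[0], skip_special_tokens=True))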