Renamed model, tweaked ingestion, and removed unnecessary fields

This commit is contained in:
Viswamedha Nalabotu 2026-03-22 15:35:21 +00:00
parent c6f7f8917a
commit df8603fa6e
17 changed files with 62 additions and 72 deletions

View file

@ -1,19 +1,19 @@
from django.contrib import admin
from django.utils.translation import gettext_lazy as _
from apps.knowledge.models import RoleRagDocument, TrainingFile
from apps.knowledge.models import KnowledgeChunk, TrainingFile
@admin.register(TrainingFile)
class TrainingFileAdmin(admin.ModelAdmin):
list_display = ('file_name', 'organization', 'role', 'status', 'is_processed', 'uploaded_by', 'created_at')
list_filter = ('status', 'is_processed', 'organization', 'created_at')
list_display = ('file_name', 'organization', 'role', 'status', 'uploaded_by', 'created_at')
list_filter = ('status', 'organization', 'created_at')
search_fields = ('file_name', 'organization__name', 'role__name', 'uploaded_by__email_address')
raw_id_fields = ('organization', 'role', 'uploaded_by')
readonly_fields = ('uuid', 'file_size', 'file_type', 'created_at', 'updated_at')
ordering = ('-created_at',)
@admin.register(RoleRagDocument)
class RoleRagDocumentAdmin(admin.ModelAdmin):
@admin.register(KnowledgeChunk)
class KnowledgeChunkAdmin(admin.ModelAdmin):
list_display = ('organization', 'role', 'chunk_index', 'training_file', 'is_active', 'created_at')
list_filter = ('is_active', 'organization', 'created_at')
search_fields = ('content', 'organization__name', 'role__name', 'training_file__file_name')

View file

@ -29,7 +29,6 @@ class Migration(migrations.Migration):
('file_type', models.CharField(max_length=50)),
('description', models.TextField(blank=True, default='')),
('status', models.CharField(choices=[('ingesting', 'Ingesting'), ('chunked', 'Chunked'), ('embedded', 'Embedded'), ('failed', 'Failed')], default='ingesting', max_length=20)),
('is_processed', models.BooleanField(default=False)),
('organization', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='training_files', to='accounts.organization')),
('role', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='training_files', to='accounts.role')),
('uploaded_by', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='uploaded_training_files', to=settings.AUTH_USER_MODEL)),
@ -41,7 +40,7 @@ class Migration(migrations.Migration):
},
),
migrations.CreateModel(
name='RoleRagDocument',
name='KnowledgeChunk',
fields=[
('id', models.BigAutoField(primary_key=True, serialize=False, verbose_name='ID')),
('uuid', models.UUIDField(default=uuid.uuid4, editable=False, verbose_name='UUID')),
@ -53,13 +52,13 @@ class Migration(migrations.Migration):
('metadata', models.JSONField(blank=True, default=dict)),
('chunk_index', models.IntegerField(default=0)),
('is_active', models.BooleanField(default=True)),
('organization', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='rag_documents', to='accounts.organization')),
('role', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='rag_documents', to='accounts.role')),
('organization', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='knowledge_chunks', to='accounts.organization')),
('role', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='knowledge_chunks', to='accounts.role')),
('training_file', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='chunks', to='knowledge.trainingfile')),
],
options={
'verbose_name': 'Role RAG Document',
'verbose_name_plural': 'Role RAG Documents',
'verbose_name': 'Knowledge Chunk',
'verbose_name_plural': 'Knowledge Chunks',
},
),
]

View file

@ -30,7 +30,6 @@ class TrainingFile(IdentifierMixin, TimeStampMixin, Model):
description = TextField(blank=True, default='')
status = CharField(max_length=20, choices=STATUS_CHOICES, default='ingesting')
is_processed = BooleanField(default=False)
class Meta:
verbose_name = _("Training File")
@ -42,10 +41,10 @@ class TrainingFile(IdentifierMixin, TimeStampMixin, Model):
return f"{self.file_name} ({self.role.name})"
return f"{self.file_name} ({self.organization.name} - Organization-wide)"
class RoleRagDocument(IdentifierMixin, TimeStampMixin, Model):
class KnowledgeChunk(IdentifierMixin, TimeStampMixin, Model):
organization = ForeignKey(Organization, on_delete=CASCADE, related_name='rag_documents')
role = ForeignKey(Role, on_delete=SET_NULL, related_name='rag_documents', null=True, blank=True)
organization = ForeignKey(Organization, on_delete=CASCADE, related_name='knowledge_chunks')
role = ForeignKey(Role, on_delete=SET_NULL, related_name='knowledge_chunks', null=True, blank=True)
training_file = ForeignKey(TrainingFile, on_delete=CASCADE, related_name='chunks', null=True, blank=True)
content = TextField()
@ -58,8 +57,8 @@ class RoleRagDocument(IdentifierMixin, TimeStampMixin, Model):
is_active = BooleanField(default=True)
class Meta:
verbose_name = _("Role RAG Document")
verbose_name_plural = _("Role RAG Documents")
verbose_name = _("Knowledge Chunk")
verbose_name_plural = _("Knowledge Chunks")
def __str__(self) -> str:
if self.role_id:

View file

@ -1,7 +1,7 @@
from rest_framework.serializers import ModelSerializer, SerializerMethodField
from apps.accounts.serializers import OrganizationSerializer, RoleSerializer, UserSerializer
from apps.knowledge.models import RoleRagDocument, TrainingFile
from apps.knowledge.models import KnowledgeChunk, TrainingFile
class TrainingFileSerializer(ModelSerializer):
uploaded_by = UserSerializer(read_only=True)
@ -15,11 +15,11 @@ class TrainingFileSerializer(ModelSerializer):
fields = [
'id', 'uuid', 'organization', 'role', 'scope', 'uploaded_by', 'file', 'file_url',
'file_name', 'file_size', 'file_type', 'description',
'status', 'is_processed', 'created_at', 'updated_at'
'status', 'created_at', 'updated_at'
]
read_only_fields = [
'id', 'uuid', 'uploaded_by', 'file_size', 'file_type',
'status', 'is_processed', 'created_at', 'updated_at',
'id', 'uuid', 'uploaded_by', 'file_size', 'file_type',
'status', 'created_at', 'updated_at',
'organization', 'role', 'scope'
]
@ -32,17 +32,17 @@ class TrainingFileSerializer(ModelSerializer):
def get_scope(self, obj: TrainingFile) -> str:
return 'role' if obj.role_id else 'organization'
class RoleRagDocumentSerializer(ModelSerializer):
class KnowledgeChunkSerializer(ModelSerializer):
training_file_name = SerializerMethodField()
class Meta:
model = RoleRagDocument
model = KnowledgeChunk
fields = [
'id', 'uuid', 'role', 'training_file', 'training_file_name',
'content', 'content_hash', 'metadata', 'chunk_index',
'id', 'uuid', 'role', 'training_file', 'training_file_name',
'content', 'content_hash', 'metadata', 'chunk_index',
'is_active', 'created_at'
]
read_only_fields = ['id', 'uuid', 'content_hash', 'created_at']
def get_training_file_name(self, obj: RoleRagDocument) -> str:
def get_training_file_name(self, obj: KnowledgeChunk) -> str:
return obj.training_file.file_name if obj.training_file else None

View file

@ -8,7 +8,7 @@ from docx import Document
from httpx import Client, Timeout
from pypdf import PdfReader
from apps.knowledge.models import RoleRagDocument, TrainingFile
from apps.knowledge.models import KnowledgeChunk, TrainingFile
logger = logging.getLogger(__name__)
@ -46,7 +46,7 @@ def _get_text_chunks(text: str, size: int = 10000):
def ingest_training_file_task(self, file_uuid):
"""
Ingests a training file by extracting text, chunking it, generating embeddings via an external service,
and saving RoleRagDocument entries. Updates the file status accordingly and triggers prompt refinement.
and saving KnowledgeChunk entries. Updates the file status accordingly and triggers prompt refinement.
"""
try:
file_obj = TrainingFile.objects.get(uuid=file_uuid)
@ -83,7 +83,7 @@ def ingest_training_file_task(self, file_uuid):
embeddings = result['embeddings']
for chunk_text, embedding in zip(chunks, embeddings):
all_documents.append(RoleRagDocument(
all_documents.append(KnowledgeChunk(
organization=file_obj.organization,
role=file_obj.role,
training_file=file_obj,
@ -98,13 +98,14 @@ def ingest_training_file_task(self, file_uuid):
},
))
chunk_counter += 1
existing_hashes = set(KnowledgeChunk.objects.filter(training_file=file_obj).values_list('content_hash', flat=True))
new_documents = [d for d in all_documents if d.content_hash not in existing_hashes]
with transaction.atomic():
RoleRagDocument.objects.bulk_create(all_documents)
KnowledgeChunk.objects.bulk_create(new_documents)
file_obj.status = 'embedded'
file_obj.is_processed = True
file_obj.save()
if file_obj.role_id:
@ -142,7 +143,7 @@ def update_agent_prompts_from_file_task(self, role_uuid: str):
}
chunk_texts = list(
RoleRagDocument.objects.filter(role=role, is_active=True)
KnowledgeChunk.objects.filter(role=role, is_active=True)
.order_by('training_file_id', 'chunk_index')
.values_list('content', flat=True)[:30]
)

View file

@ -8,7 +8,7 @@ from rest_framework.status import HTTP_200_OK, HTTP_201_CREATED, HTTP_204_NO_CON
from rest_framework.test import APIClient
from apps.accounts.models import Organization, Role
from apps.knowledge.models import RoleRagDocument, TrainingFile, trigger_ingestion
from apps.knowledge.models import KnowledgeChunk, TrainingFile, trigger_ingestion
User = get_user_model()
@ -54,7 +54,7 @@ class KnowledgeApiTests(TestCase):
file_size=7,
file_type='text/plain',
)
self.rag_doc = RoleRagDocument.objects.create(
self.chunk = KnowledgeChunk.objects.create(
organization=self.org,
role=self.role,
training_file=self.training_file,
@ -110,14 +110,14 @@ class KnowledgeApiTests(TestCase):
response = self.client.delete(f'/api/training-file/{self.training_file.uuid}/')
self.assertEqual(response.status_code, HTTP_204_NO_CONTENT)
def test_role_rag_document_list_path(self):
def test_knowledge_chunk_list_path(self):
self.client.force_authenticate(self.member)
response = self.client.get('/api/role-rag-document/')
response = self.client.get('/api/knowledge-chunk/')
self.assertEqual(response.status_code, HTTP_200_OK)
def test_role_rag_document_retrieve_path(self):
def test_knowledge_chunk_retrieve_path(self):
self.client.force_authenticate(self.member)
response = self.client.get(f'/api/role-rag-document/{self.rag_doc.uuid}/')
response = self.client.get(f'/api/knowledge-chunk/{self.chunk.uuid}/')
self.assertEqual(response.status_code, HTTP_200_OK)
def test_training_file_list_for_non_member_returns_empty(self):

View file

@ -4,7 +4,7 @@ from django.db.models.signals import post_save
from django.test import TestCase
from apps.accounts.models import Organization, Role
from apps.knowledge.models import RoleRagDocument, TrainingFile, trigger_ingestion
from apps.knowledge.models import KnowledgeChunk, TrainingFile, trigger_ingestion
User = get_user_model()
@ -52,7 +52,7 @@ class KnowledgeModelTests(TestCase):
self.assertEqual(training_file.file_type, 'text/plain')
self.assertEqual(training_file.description, 'sample')
self.assertEqual(training_file.status, 'ingesting')
self.assertFalse(training_file.is_processed)
self.assertEqual(training_file.status, 'ingesting')
self.assertIsNotNone(training_file.id)
self.assertIsNotNone(training_file.uuid)
@ -61,7 +61,7 @@ class KnowledgeModelTests(TestCase):
self.assertIn('training.txt (Analyst)', str(training_file))
def test_role_rag_document_fields_and_defaults(self):
def test_knowledge_chunk_fields_and_defaults(self):
uploaded = SimpleUploadedFile('base.txt', b'base', content_type='text/plain')
training_file = TrainingFile.objects.create(
organization=self.org,
@ -72,7 +72,7 @@ class KnowledgeModelTests(TestCase):
file_size=4,
file_type='text/plain',
)
document = RoleRagDocument.objects.create(
document = KnowledgeChunk.objects.create(
organization=self.org,
role=self.role,
training_file=training_file,

View file

@ -8,8 +8,8 @@ from rest_framework.viewsets import ModelViewSet, ReadOnlyModelViewSet
from apps.accounts.models import Organization, Role
from apps.accounts.permissions import can_manage_organization
from apps.knowledge.models import RoleRagDocument, TrainingFile
from apps.knowledge.serializers import RoleRagDocumentSerializer, TrainingFileSerializer
from apps.knowledge.models import KnowledgeChunk, TrainingFile
from apps.knowledge.serializers import KnowledgeChunkSerializer, TrainingFileSerializer
from apps.knowledge.tasks import ingest_training_file_task, update_agent_prompts_from_file_task
class TrainingFileViewSet(ModelViewSet):
@ -93,8 +93,7 @@ class TrainingFileViewSet(ModelViewSet):
raise ValidationError({'status': 'Only failed files can be retried.'})
instance.status = 'ingesting'
instance.is_processed = False
instance.save(update_fields=['status', 'is_processed'])
instance.save(update_fields=['status'])
ingest_training_file_task.delay(str(instance.uuid))
serializer = self.get_serializer(instance)
@ -112,15 +111,15 @@ class TrainingFileViewSet(ModelViewSet):
update_agent_prompts_from_file_task.delay(role_uuid)
return response
class RoleRagDocumentViewSet(ReadOnlyModelViewSet):
queryset = RoleRagDocument.objects.all()
serializer_class = RoleRagDocumentSerializer
class KnowledgeChunkViewSet(ReadOnlyModelViewSet):
queryset = KnowledgeChunk.objects.all()
serializer_class = KnowledgeChunkSerializer
permission_classes = [IsAuthenticated]
lookup_field = 'uuid'
def get_queryset(self):
user = self.request.user
queryset = RoleRagDocument.objects.filter(
queryset = KnowledgeChunk.objects.filter(
Q(organization__owner=user) |
Q(organization__members=user)
).distinct()

View file

@ -34,7 +34,7 @@ class OnboardingSessionAdmin(admin.ModelAdmin):
fieldsets = (
(None, {'fields': ('user', 'role', 'status', 'uuid')}),
(_('Live State'), {'fields': ('state', 'active_configs')}),
(_('Live State'), {'fields': ('state',)}),
(_('Timestamps'), {'fields': ('completed_at', 'created_at', 'updated_at')}),
)

View file

@ -8,7 +8,7 @@ from django.db.models import Q
from pgvector.django import CosineDistance
from apps.accounts.models import Role
from apps.knowledge.models import RoleRagDocument, TrainingFile
from apps.knowledge.models import KnowledgeChunk, TrainingFile
from apps.onboarding.models import OnboardingSession
logger = logging.getLogger(__name__)
@ -105,7 +105,7 @@ class MCPRouter:
logger.warning('MCP search_knowledge_documents role not found: role_uuid=%s', role_uuid)
return []
docs = RoleRagDocument.objects.filter(
docs = KnowledgeChunk.objects.filter(
organization=role.organization,
embedding__isnull=False,
is_active=True,
@ -205,7 +205,7 @@ class MCPRouter:
files = list(
TrainingFile.objects.filter(
organization=role.organization,
is_processed=True,
status='embedded',
).filter(
Q(role__uuid=role_uuid) | Q(role__isnull=True)
).values('file_name', 'description', 'file_type')[:20]

View file

@ -59,7 +59,6 @@ class Migration(migrations.Migration):
('updated_at', models.DateTimeField(auto_now=True, verbose_name='Updated At')),
('status', models.CharField(choices=[('active', 'Active'), ('completed', 'Completed'), ('paused', 'Paused')], default='active', max_length=20, verbose_name='Session Status')),
('state', models.JSONField(blank=True, default=dict, verbose_name='Session State')),
('active_configs', models.JSONField(default=dict, verbose_name='Active Configs')),
('completed_at', models.DateTimeField(blank=True, null=True, verbose_name='Completed At')),
('flow', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='sessions', to='onboarding.onboardingflow', verbose_name='Onboarding Flow')),
('role', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='onboarding_sessions', to='accounts.role', verbose_name='Target Role')),

View file

@ -55,7 +55,6 @@ class OnboardingSession(IdentifierMixin, TimeStampMixin, Model):
status = CharField(max_length=20, choices=STATUS_CHOICES, default='active', verbose_name=_("Session Status"))
state = JSONField(default=dict, blank=True, verbose_name=_("Session State"))
active_configs = JSONField(default=dict, verbose_name=_("Active Configs"))
completed_at = DateTimeField(null=True, blank=True, verbose_name=_("Completed At"))
class Meta:

View file

@ -38,7 +38,7 @@ class OnboardingSessionSerializer(ModelSerializer):
model = OnboardingSession
fields = [
'id', 'uuid', 'user', 'role', 'flow', 'status', 'state',
'active_configs', 'logs', 'completed_at', 'created_at',
'logs', 'completed_at', 'created_at',
'updated_at', 'progress_percentage'
]
read_only_fields = ['id', 'uuid', 'user', 'completed_at', 'created_at', 'updated_at']

View file

@ -50,7 +50,7 @@ class OnboardingApiTests(TestCase):
role=self.role,
flow=self.flow,
state={'progress': 10},
active_configs={},
)
self.log = AgentInteractionLog.objects.create(
session=self.session,
@ -210,7 +210,6 @@ class OnboardingApiTests(TestCase):
'role': str(self.role.uuid),
'status': 'active',
'state': {'progress': 0},
'active_configs': {},
}, format='json')
self.assertEqual(response.status_code, HTTP_400_BAD_REQUEST)
@ -228,7 +227,6 @@ class OnboardingApiTests(TestCase):
'role': str(self.role.uuid),
'status': 'paused',
'state': {'progress': 20},
'active_configs': {'knowledge': 'enabled'},
},
format='json',
)
@ -249,7 +247,7 @@ class OnboardingApiTests(TestCase):
user=self.member,
role=self.role,
state={},
active_configs={},
)
response = self.client.delete(f'/api/onboarding-session/{deletable.uuid}/')
self.assertEqual(response.status_code, HTTP_204_NO_CONTENT)
@ -610,7 +608,7 @@ class OnboardingApiTests(TestCase):
role=self.role,
flow=flow_to_delete,
state={'flow_uuid': str(flow_to_delete.uuid)},
active_configs={},
)
untouched_flow = OnboardingFlow.objects.create(
title='Keep Me and Sessions',
@ -622,7 +620,7 @@ class OnboardingApiTests(TestCase):
role=self.role,
flow=untouched_flow,
state={'flow_uuid': str(untouched_flow.uuid)},
active_configs={},
)
response = self.client.delete(f'/api/onboarding-flow/{flow_to_delete.uuid}/')
@ -716,7 +714,7 @@ class OnboardingApiTests(TestCase):
user=self.manager,
role=self.role,
state={'flow_uuid': str(self.flow.uuid), 'progress_percentage': 10},
active_configs={},
)
self.client.force_authenticate(self.manager)

View file

@ -45,14 +45,12 @@ class OnboardingModelTests(TestCase):
role=self.role,
status='active',
state={'progress': 25},
active_configs={'knowledge': 'enabled'},
)
self.assertEqual(session.user, self.user)
self.assertEqual(session.role, self.role)
self.assertEqual(session.status, 'active')
self.assertEqual(session.state, {'progress': 25})
self.assertEqual(session.active_configs, {'knowledge': 'enabled'})
self.assertIsNone(session.completed_at)
self.assertIsNotNone(session.id)
self.assertIsNotNone(session.uuid)
@ -69,7 +67,6 @@ class OnboardingModelTests(TestCase):
user=self.user,
role=self.role,
state={},
active_configs={},
)
log = AgentInteractionLog.objects.create(
session=session,

View file

@ -178,7 +178,6 @@ class OnboardingFlowViewSet(RequestParamMixin, ModelViewSet):
'progress': 0,
'current_step': 'intro',
},
active_configs={},
)
serializer = OnboardingSessionSerializer(session)
return Response(serializer.data, status=HTTP_201_CREATED)

View file

@ -1,7 +1,7 @@
from rest_framework.routers import DefaultRouter
from apps.accounts.viewsets import UserViewSet, OrganizationViewSet, InviteViewSet, RoleViewSet
from apps.knowledge.viewsets import TrainingFileViewSet, RoleRagDocumentViewSet
from apps.knowledge.viewsets import KnowledgeChunkViewSet, TrainingFileViewSet
from apps.onboarding.viewsets import AgentConfigViewSet, OnboardingFlowViewSet, OnboardingSessionViewSet, AgentInteractionLogViewSet
router = DefaultRouter()
@ -11,7 +11,7 @@ router.register(r'organization', OrganizationViewSet)
router.register(r'invite', InviteViewSet)
router.register(r'role', RoleViewSet)
router.register(r'training-file', TrainingFileViewSet)
router.register(r'role-rag-document', RoleRagDocumentViewSet)
router.register(r'knowledge-chunk', KnowledgeChunkViewSet)
router.register(r'agent-config', AgentConfigViewSet)
router.register(r'onboarding-flow', OnboardingFlowViewSet)
router.register(r'onboarding-session', OnboardingSessionViewSet)