Tweaking and reframing yields and streaming with extra garbage collection

2026-03-22 17:15:42 +00:00 · 2026-03-22 17:15:42 +00:00 · bf9eb6efb5
commit bf9eb6efb5
parent 1eada257b9
1 changed files with 35 additions and 18 deletions
--- a/gpu_server.py
+++ b/gpu_server.py
@ -1,4 +1,5 @@
 import asyncio
+import gc
 import logging
 import os
 import json
@ -38,6 +39,7 @@ def _load_llm() -> Llama:
 def _unload_llm():
    llm = state.pop("llm", None)
    del llm
+    gc.collect()
    if cuda.is_available():
        cuda.empty_cache()
    logger.info("LLM unloaded due to inactivity")
@ -56,12 +58,13 @@ def _touch_llm():
    state["llm_last_used"] = time.monotonic()

 async def _ensure_llm() -> Llama:
-    llm = state.get("llm")
-    if llm is None:
-        if not os.path.exists(LLM_MODEL_PATH):
-            raise HTTPException(status_code=503, detail="LLM model file not found.")
-        loop = asyncio.get_event_loop()
-        state["llm"] = await loop.run_in_executor(None, _load_llm)
+    if state.get("llm") is None:
+        async with gpu_semaphore:
+            if state.get("llm") is None:
+                if not os.path.exists(LLM_MODEL_PATH):
+                    raise HTTPException(status_code=503, detail="LLM model file not found.")
+                loop = asyncio.get_event_loop()
+                state["llm"] = await loop.run_in_executor(None, _load_llm)
    _touch_llm()
    return state["llm"]

@ -225,7 +228,8 @@ async def semantic_chunk(request: Request):
            single = pad_and_normalize(single, target_dimensions=TARGET_DIMENSIONS)
            return {"chunks": [raw_text], "embeddings": single.cpu().tolist()}

-        s_embeddings = model.encode(sentences, convert_to_tensor=True)
+        with no_grad():
+            s_embeddings = model.encode(sentences, convert_to_tensor=True)
        distances = [
            1 - F.cosine_similarity(s_embeddings[i].unsqueeze(0), s_embeddings[i+1].unsqueeze(0)).item()
            for i in range(len(s_embeddings) - 1)
@ -286,20 +290,33 @@ async def chat_completions(request: Request):

    try:
        if stream:
-            def _infer_stream():
-                return llm.create_chat_completion(
-                    messages=messages,
-                    stream=True,
-                    temperature=temperature,
-                    max_tokens=max_tokens,
-                    stop=["<|eot_id|>", "<|end_of_text|>"],
-                )
+            sentinel = object()

            async def _stream_response():
+                queue: asyncio.Queue = asyncio.Queue()
+                _loop = asyncio.get_event_loop()
+
+                def _produce():
+                    try:
+                        for chunk in llm.create_chat_completion(
+                            messages=messages,
+                            stream=True,
+                            temperature=temperature,
+                            max_tokens=max_tokens,
+                            stop=["<|eot_id|>", "<|end_of_text|>"],
+                        ):
+                            _loop.call_soon_threadsafe(queue.put_nowait, chunk)
+                    finally:
+                        _loop.call_soon_threadsafe(queue.put_nowait, sentinel)
+
                async with gpu_semaphore:
-                    chunks = await loop.run_in_executor(None, lambda: list(_infer_stream()))
-                for chunk in chunks:
-                    yield f"data: {json.dumps(chunk)}\n\n"
+                    fut = _loop.run_in_executor(None, _produce)
+                    while True:
+                        item = await queue.get()
+                        if item is sentinel:
+                            break
+                        yield f"data: {json.dumps(item)}\n\n"
+                    await fut
                yield "data: [DONE]\n\n"

            return StreamingResponse(_stream_response(), media_type="text/event-stream")