apache · imbajin · Jun 8, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 5, 2026
diff --git a/hugegraph-llm/src/hugegraph_llm/config/models/base_prompt_config.py b/hugegraph-llm/src/hugegraph_llm/config/models/base_prompt_config.py
@@ -78,6 +78,7 @@ class BasePromptConfig:
     text2gql_graph_schema: str = ""
     gremlin_generate_prompt: str = ""
     doc_input_text: str = ""
+    graph_extract_split_type: str = "document"
     _language_generated: str = ""
     generate_extract_prompt_template: str = ""
 
@@ -136,6 +137,7 @@ def to_literal(val):
             "keywords_extract_prompt": to_literal(self.keywords_extract_prompt),
             "gremlin_generate_prompt": to_literal(self.gremlin_generate_prompt),
             "doc_input_text": to_literal(self.doc_input_text),
+            "graph_extract_split_type": to_literal(self.graph_extract_split_type),
             "_language_generated": str(self.llm_settings.language).lower().strip(),
             "generate_extract_prompt_template": to_literal(self.generate_extract_prompt_template),
         }

diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/vector_graph_block.py b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/vector_graph_block.py
@@ -44,12 +44,17 @@
 )
 
 
-def store_prompt(doc, schema, example_prompt):
-    # update env variables: doc, schema and example_prompt
-    if prompt.doc_input_text != doc or prompt.graph_schema != schema or prompt.extract_graph_prompt != example_prompt:
+def store_prompt(doc, schema, example_prompt, graph_extract_split_type="document"):
+    if (  # Update `prompt` and call update_yaml_file() only when at least one of the four inputs changed.
+        prompt.doc_input_text != doc
+        or prompt.graph_schema != schema
+        or prompt.extract_graph_prompt != example_prompt
+        or prompt.graph_extract_split_type != graph_extract_split_type
+    ):
         prompt.doc_input_text = doc
         prompt.graph_schema = schema
         prompt.extract_graph_prompt = example_prompt
+        prompt.graph_extract_split_type = graph_extract_split_type  # Keep the selected split type with the other fields.
         prompt.update_yaml_file()
 
 
@@ -270,6 +275,12 @@ def create_vector_graph_block():
                     graph_data_btn0 = gr.Button("Clear Graph Data", size="sm")
 
             vector_import_bt = gr.Button("Import into Vector", variant="primary")
+            graph_split_type = gr.Dropdown(
+                choices=["document", "paragraph", "sentence"],
+                value=prompt.graph_extract_split_type,
+                label="Graph Extraction Split Type",
+                info=("document keeps the current behavior; paragraph/sentence split long docs before extraction."),
+            )
             graph_extract_bt = gr.Button("Extract Graph Data (1)", variant="primary")
             graph_loading_bt = gr.Button("Load into GraphDB (2)", interactive=True)
             graph_index_rebuild_bt = gr.Button("Update Vid Embedding")
@@ -300,48 +311,54 @@ def create_vector_graph_block():
 
         vector_index_btn0.click(get_vector_index_info, outputs=out).then(
             store_prompt,
-            inputs=[input_text, input_schema, info_extract_template],
+            inputs=[input_text, input_schema, info_extract_template, graph_split_type],
         )
         vector_index_btn1.click(clean_vector_index).then(
             store_prompt,
-            inputs=[input_text, input_schema, info_extract_template],
+            inputs=[input_text, input_schema, info_extract_template, graph_split_type],
         )
         vector_import_bt.click(build_vector_index, inputs=[input_file, input_text], outputs=out).then(
             store_prompt,
-            inputs=[input_text, input_schema, info_extract_template],
+            inputs=[input_text, input_schema, info_extract_template, graph_split_type],
         )
         graph_index_btn0.click(get_graph_index_info, outputs=out).then(
             store_prompt,
-            inputs=[input_text, input_schema, info_extract_template],
+            inputs=[input_text, input_schema, info_extract_template, graph_split_type],
         )
         graph_index_btn1.click(clean_all_graph_index).then(
             store_prompt,
-            inputs=[input_text, input_schema, info_extract_template],
+            inputs=[input_text, input_schema, info_extract_template, graph_split_type],
         )
         graph_data_btn0.click(clean_all_graph_data).then(
             store_prompt,
-            inputs=[input_text, input_schema, info_extract_template],
+            inputs=[input_text, input_schema, info_extract_template, graph_split_type],
         )
         graph_index_rebuild_bt.click(update_vid_embedding, outputs=out).then(
             store_prompt,
-            inputs=[input_text, input_schema, info_extract_template],
+            inputs=[input_text, input_schema, info_extract_template, graph_split_type],
         )
 
         # origin_out = gr.Textbox(visible=False)
         graph_extract_bt.click(
             extract_graph,
-            inputs=[input_file, input_text, input_schema, info_extract_template],
+            inputs=[
+                input_file,
+                input_text,
+                input_schema,
+                info_extract_template,
+                graph_split_type,
+            ],
             outputs=[out],
         ).then(
             store_prompt,
-            inputs=[input_text, input_schema, info_extract_template],
+            inputs=[input_text, input_schema, info_extract_template, graph_split_type],
         )
 
         graph_loading_bt.click(import_graph_data, inputs=[out, input_schema], outputs=[out]).then(
             update_vid_embedding
         ).then(
             store_prompt,
-            inputs=[input_text, input_schema, info_extract_template],
+            inputs=[input_text, input_schema, info_extract_template, graph_split_type],
         )
 
         # TODO: we should store the examples after the user changed them.
@@ -355,6 +372,7 @@ def create_vector_graph_block():
                 input_text,
                 input_schema,
                 info_extract_template,
+                graph_split_type,
             ],  # TODO: Store the updated examples
         )
 

diff --git a/hugegraph-llm/src/hugegraph_llm/flows/graph_extract.py b/hugegraph-llm/src/hugegraph_llm/flows/graph_extract.py
@@ -21,6 +21,10 @@
 from hugegraph_llm.nodes.document_node.chunk_split import ChunkSplitNode
 from hugegraph_llm.nodes.hugegraph_node.schema import SchemaNode
 from hugegraph_llm.nodes.llm_node.extract_info import ExtractNode
+from hugegraph_llm.operators.document_op.chunk_split import (
+    SPLIT_TYPE_DOCUMENT,
+    VALID_SPLIT_TYPES,
+)
 from hugegraph_llm.state.ai_state import WkFlowInput, WkFlowState
 from hugegraph_llm.utils.log import log
 
@@ -37,22 +41,43 @@ def prepare(
         texts,
         example_prompt,
         extract_type,
+        split_type=SPLIT_TYPE_DOCUMENT,
         language="zh",
         **kwargs,
     ):
         # prepare input data
         prepared_input.texts = texts
         prepared_input.language = language
-        prepared_input.split_type = "document"
+        if split_type not in VALID_SPLIT_TYPES:
+            raise ValueError("split_type must be document, paragraph, or sentence")
+
+        prepared_input.split_type = split_type
         prepared_input.example_prompt = example_prompt
         prepared_input.schema = schema
         prepared_input.extract_type = extract_type
 
-    def build_flow(self, schema, texts, example_prompt, extract_type, language="zh", **kwargs):
+    def build_flow(
+        self,
+        schema,
+        texts,
+        example_prompt,
+        extract_type,
+        split_type=SPLIT_TYPE_DOCUMENT,
+        language="zh",
+        **kwargs,
+    ):
         pipeline = GPipeline()
         prepared_input = WkFlowInput()
         # prepare input data
-        self.prepare(prepared_input, schema, texts, example_prompt, extract_type, language)
+        self.prepare(
+            prepared_input,
+            schema,
+            texts,
+            example_prompt,
+            extract_type,
+            split_type,
+            language,
+        )
 
         pipeline.createGParam(prepared_input, "wkflow_input")
         pipeline.createGParam(WkFlowState(), "wkflow_state")
@@ -70,6 +95,8 @@ def post_deal(self, pipeline=None, **kwargs):
         res = pipeline.getGParamWithNoEmpty("wkflow_state").to_json()
         vertices = res.get("vertices", [])
         edges = res.get("edges", [])
+        chunk_count = len(res.get("chunks", []))
+        log.info("Graph extraction chunk_count: %s", chunk_count)
         if not vertices and not edges:
             log.info("Please check the schema.(The schema may not match the Doc)")
             return json.dumps(

diff --git a/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py b/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py
@@ -16,6 +16,7 @@
 # under the License.
 
 
+import re
 from typing import Any, Dict, List, Literal, Optional, Union
 
 from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -26,6 +27,28 @@
 SPLIT_TYPE_DOCUMENT = "document"
 SPLIT_TYPE_PARAGRAPH = "paragraph"
 SPLIT_TYPE_SENTENCE = "sentence"
+VALID_SPLIT_TYPES = (
+    SPLIT_TYPE_DOCUMENT,
+    SPLIT_TYPE_PARAGRAPH,
+    SPLIT_TYPE_SENTENCE,
+)
+
+# One sentence = a run of body characters plus any terminators that follow it.
+# Terminators are ASCII . ! ? ; and their full-width CJK counterparts. A "."
+# directly followed by an ASCII letter or digit (3.14, v1.2, example.com) stays in the body.
+_SENTENCE_PATTERN = re.compile(
+    r"(?:[^.!?\u3002\uff01\uff1f\uff1b;]|\.(?=[0-9A-Za-z]))+"
+    r"[.!?\u3002\uff01\uff1f\uff1b;]*"
+)
+
+
+def _split_sentence_boundaries(text: str) -> list[str]:
+    """Split ``text`` into stripped, non-empty sentences.
+
+    Each sentence keeps its trailing terminator(s). Whitespace-only fragments and
+    terminators with no preceding body text are dropped, so blank input yields ``[]``.
+    """
+    return [stripped for match in _SENTENCE_PATTERN.findall(text) if (stripped := match.strip())]
 
 
 class ChunkSplit:
@@ -56,8 +67,8 @@ def _get_text_splitter(self, split_type: str):
                 chunk_size=500, chunk_overlap=30, separators=self.separators
             ).split_text
         if split_type == SPLIT_TYPE_SENTENCE:
-            return RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=0, separators=self.separators).split_text
-        raise ValueError("Type must be paragraph, sentence, html or markdown")
+            return _split_sentence_boundaries
+        raise ValueError("split_type must be document, paragraph, or sentence")
 
     def run(self, context: Optional[Dict[str, Any]]) -> Dict[str, Any]:
         all_chunks = []

diff --git a/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py b/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
@@ -24,6 +24,10 @@
 
 from hugegraph_llm.flows import FlowName
 from hugegraph_llm.flows.scheduler import SchedulerSingleton
+from hugegraph_llm.operators.document_op.chunk_split import (
+    SPLIT_TYPE_DOCUMENT,
+    VALID_SPLIT_TYPES,
+)
 
 from ..config import huge_settings
 from .hugegraph_utils import clean_hg_data
@@ -77,14 +81,28 @@ def clean_all_graph_data():
     gr.Info("Clear graph data successfully!")
 
 
-def extract_graph(input_file, input_text, schema, example_prompt) -> str:
+def extract_graph(
+    input_file,
+    input_text,
+    schema,
+    example_prompt,
+    split_type=SPLIT_TYPE_DOCUMENT,  # Optional, so existing four-argument callers keep working.
+) -> str:
     texts = read_documents(input_file, input_text)
     scheduler = SchedulerSingleton.get_instance()
     if not schema:
         return "ERROR: please input with correct schema/format."
-
+    if split_type not in VALID_SPLIT_TYPES:  # Reject unknown split types before the flow is scheduled.
+        raise gr.Error("split_type must be document, paragraph, or sentence")
     try:
-        return scheduler.schedule_flow(FlowName.GRAPH_EXTRACT, schema, texts, example_prompt, "property_graph")
+        return scheduler.schedule_flow(
+            FlowName.GRAPH_EXTRACT,
+            schema,
+            texts,
+            example_prompt,
+            "property_graph",
+            split_type=split_type,  # Passed by keyword; presumably forwarded to the flow's build_flow -- verify.
+        )
     except Exception as e:  # pylint: disable=broad-exception-caught
         log.error(e)
         raise gr.Error(str(e))