Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ class BasePromptConfig:
text2gql_graph_schema: str = ""
gremlin_generate_prompt: str = ""
doc_input_text: str = ""
graph_extract_split_type: str = "document"
_language_generated: str = ""
generate_extract_prompt_template: str = ""

Expand Down Expand Up @@ -136,6 +137,7 @@ def to_literal(val):
"keywords_extract_prompt": to_literal(self.keywords_extract_prompt),
"gremlin_generate_prompt": to_literal(self.gremlin_generate_prompt),
"doc_input_text": to_literal(self.doc_input_text),
"graph_extract_split_type": to_literal(self.graph_extract_split_type),
"_language_generated": str(self.llm_settings.language).lower().strip(),
"generate_extract_prompt_template": to_literal(self.generate_extract_prompt_template),
}
Expand Down
44 changes: 31 additions & 13 deletions hugegraph-llm/src/hugegraph_llm/demo/rag_demo/vector_graph_block.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,17 @@
)


def store_prompt(doc, schema, example_prompt):
# update env variables: doc, schema and example_prompt
if prompt.doc_input_text != doc or prompt.graph_schema != schema or prompt.extract_graph_prompt != example_prompt:
def store_prompt(doc, schema, example_prompt, graph_extract_split_type="document"):
    """Sync the UI inputs into the shared ``prompt`` config when any of them changed.

    Args:
        doc: Document text to store as ``prompt.doc_input_text``.
        schema: Graph schema to store as ``prompt.graph_schema``.
        example_prompt: Extraction prompt to store as ``prompt.extract_graph_prompt``.
        graph_extract_split_type: Split type to store as
            ``prompt.graph_extract_split_type``; defaults to ``"document"``.

    When all four values already match the config nothing is written;
    otherwise the fields are updated and ``prompt.update_yaml_file()`` is called.
    """
    stored = (
        prompt.doc_input_text,
        prompt.graph_schema,
        prompt.extract_graph_prompt,
        prompt.graph_extract_split_type,
    )
    incoming = (doc, schema, example_prompt, graph_extract_split_type)
    if stored == incoming:
        # Nothing changed, so skip the config update entirely.
        return

    (
        prompt.doc_input_text,
        prompt.graph_schema,
        prompt.extract_graph_prompt,
        prompt.graph_extract_split_type,
    ) = incoming
    prompt.update_yaml_file()


Expand Down Expand Up @@ -270,6 +275,12 @@ def create_vector_graph_block():
graph_data_btn0 = gr.Button("Clear Graph Data", size="sm")

vector_import_bt = gr.Button("Import into Vector", variant="primary")
graph_split_type = gr.Dropdown(
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Persist the selected split type

Evidence: the dropdown is wired into `extract_graph`, but the existing `store_prompt()` call only saves `doc`, `schema`, and `example_prompt`; reload also restores only those fields, and `BasePromptConfig.save_to_yaml()` has no split-type field.

Impact: after reload, a user who selected paragraph or sentence silently falls back to document, so the next extraction can run with different chunking than the UI state they expected.

Requested fix: save and reload this split type through the prompt config path, or make the control explicitly transient. A prompt-config round-trip test would cover the regression.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review. I updated the demo prompt config path to persist graph_extract_split_type and reload it into the graph split dropdown. This should prevent the selected paragraph or sentence value from silently falling back to document after reload.

choices=["document", "paragraph", "sentence"],
value=prompt.graph_extract_split_type,
label="Graph Extraction Split Type",
info=("document keeps the current behavior; paragraph/sentence split long docs before extraction."),
)
graph_extract_bt = gr.Button("Extract Graph Data (1)", variant="primary")
graph_loading_bt = gr.Button("Load into GraphDB (2)", interactive=True)
graph_index_rebuild_bt = gr.Button("Update Vid Embedding")
Expand Down Expand Up @@ -300,48 +311,54 @@ def create_vector_graph_block():

vector_index_btn0.click(get_vector_index_info, outputs=out).then(
store_prompt,
inputs=[input_text, input_schema, info_extract_template],
inputs=[input_text, input_schema, info_extract_template, graph_split_type],
)
vector_index_btn1.click(clean_vector_index).then(
store_prompt,
inputs=[input_text, input_schema, info_extract_template],
inputs=[input_text, input_schema, info_extract_template, graph_split_type],
)
vector_import_bt.click(build_vector_index, inputs=[input_file, input_text], outputs=out).then(
store_prompt,
inputs=[input_text, input_schema, info_extract_template],
inputs=[input_text, input_schema, info_extract_template, graph_split_type],
)
graph_index_btn0.click(get_graph_index_info, outputs=out).then(
store_prompt,
inputs=[input_text, input_schema, info_extract_template],
inputs=[input_text, input_schema, info_extract_template, graph_split_type],
)
graph_index_btn1.click(clean_all_graph_index).then(
store_prompt,
inputs=[input_text, input_schema, info_extract_template],
inputs=[input_text, input_schema, info_extract_template, graph_split_type],
)
graph_data_btn0.click(clean_all_graph_data).then(
store_prompt,
inputs=[input_text, input_schema, info_extract_template],
inputs=[input_text, input_schema, info_extract_template, graph_split_type],
)
graph_index_rebuild_bt.click(update_vid_embedding, outputs=out).then(
store_prompt,
inputs=[input_text, input_schema, info_extract_template],
inputs=[input_text, input_schema, info_extract_template, graph_split_type],
)

# origin_out = gr.Textbox(visible=False)
graph_extract_bt.click(
extract_graph,
inputs=[input_file, input_text, input_schema, info_extract_template],
inputs=[
input_file,
input_text,
input_schema,
info_extract_template,
graph_split_type,
],
outputs=[out],
).then(
store_prompt,
inputs=[input_text, input_schema, info_extract_template],
inputs=[input_text, input_schema, info_extract_template, graph_split_type],
)

graph_loading_bt.click(import_graph_data, inputs=[out, input_schema], outputs=[out]).then(
update_vid_embedding
).then(
store_prompt,
inputs=[input_text, input_schema, info_extract_template],
inputs=[input_text, input_schema, info_extract_template, graph_split_type],
)

# TODO: we should store the examples after the user changed them.
Expand All @@ -355,6 +372,7 @@ def create_vector_graph_block():
input_text,
input_schema,
info_extract_template,
graph_split_type,
], # TODO: Store the updated examples
)

Expand Down
33 changes: 30 additions & 3 deletions hugegraph-llm/src/hugegraph_llm/flows/graph_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@
from hugegraph_llm.nodes.document_node.chunk_split import ChunkSplitNode
from hugegraph_llm.nodes.hugegraph_node.schema import SchemaNode
from hugegraph_llm.nodes.llm_node.extract_info import ExtractNode
from hugegraph_llm.operators.document_op.chunk_split import (
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Sort the import block

Evidence: running `uv run --project .. --extra llm --extra dev ruff check src/hugegraph_llm/flows/graph_extract.py src/hugegraph_llm/utils/graph_index_utils.py src/hugegraph_llm/operators/document_op/chunk_split.py src/tests/document/test_graph_extract_configurable_split.py` fails on this file with `I001 Import block is un-sorted or un-formatted`.

Impact: the PR will fail the repository lint gate even though the targeted tests pass.

Requested fix: run Ruff import sorting on this file and commit the formatted import order.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. I ran Ruff import sorting and formatting on the touched files, and the import block has been reordered by ruff check --fix.

SPLIT_TYPE_DOCUMENT,
VALID_SPLIT_TYPES,
)
from hugegraph_llm.state.ai_state import WkFlowInput, WkFlowState
from hugegraph_llm.utils.log import log

Expand All @@ -37,22 +41,43 @@ def prepare(
texts,
example_prompt,
extract_type,
split_type=SPLIT_TYPE_DOCUMENT,
language="zh",
**kwargs,
):
# prepare input data
prepared_input.texts = texts
prepared_input.language = language
prepared_input.split_type = "document"
if split_type not in VALID_SPLIT_TYPES:
raise ValueError("split_type must be document, paragraph, or sentence")

prepared_input.split_type = split_type
prepared_input.example_prompt = example_prompt
prepared_input.schema = schema
prepared_input.extract_type = extract_type

def build_flow(self, schema, texts, example_prompt, extract_type, language="zh", **kwargs):
def build_flow(
self,
schema,
texts,
example_prompt,
extract_type,
split_type=SPLIT_TYPE_DOCUMENT,
language="zh",
**kwargs,
):
pipeline = GPipeline()
prepared_input = WkFlowInput()
# prepare input data
self.prepare(prepared_input, schema, texts, example_prompt, extract_type, language)
self.prepare(
prepared_input,
schema,
texts,
example_prompt,
extract_type,
split_type,
language,
)

pipeline.createGParam(prepared_input, "wkflow_input")
pipeline.createGParam(WkFlowState(), "wkflow_state")
Expand All @@ -70,6 +95,8 @@ def post_deal(self, pipeline=None, **kwargs):
res = pipeline.getGParamWithNoEmpty("wkflow_state").to_json()
vertices = res.get("vertices", [])
edges = res.get("edges", [])
chunk_count = len(res.get("chunks", []))
log.info("Graph extraction chunk_count: %s", chunk_count)
if not vertices and not edges:
log.info("Please check the schema.(The schema may not match the Doc)")
return json.dumps(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# under the License.


import re
from typing import Any, Dict, List, Literal, Optional, Union

from langchain_text_splitters import RecursiveCharacterTextSplitter
Expand All @@ -26,6 +27,16 @@
# Names of the supported chunk-splitting granularities.
SPLIT_TYPE_DOCUMENT = "document"
SPLIT_TYPE_PARAGRAPH = "paragraph"
SPLIT_TYPE_SENTENCE = "sentence"
# All supported split types, used for membership checks when validating a split type.
VALID_SPLIT_TYPES = (SPLIT_TYPE_DOCUMENT, SPLIT_TYPE_PARAGRAPH, SPLIT_TYPE_SENTENCE)


# Compiled once at import time instead of on every call.
# A sentence is a run of body characters followed by any trailing terminators
# (ASCII . ! ? ; and the full-width forms \u3002 \uff01 \uff1f \uff1b).
# A "." with a digit on both sides (e.g. "3.14", "v1.2") counts as body text,
# so decimals and version numbers are not cut in half.
_SENTENCE_PATTERN = re.compile(
    r"(?:[^.!?\u3002\uff01\uff1f\uff1b;]|(?<=\d)\.(?=\d))+"
    r"[.!?\u3002\uff01\uff1f\uff1b;]*"
)


def _split_sentence_boundaries(text: str) -> list[str]:
    """Split *text* into sentences at terminal punctuation.

    Each returned sentence keeps its trailing terminator(s) and is stripped of
    surrounding whitespace. Text after the last terminator is returned as a
    final sentence. Terminators that appear before any sentence body (for
    example a leading "...") are dropped.

    Args:
        text: The text to split.

    Returns:
        The non-empty sentences in their original order; an empty list when
        *text* is empty or contains only whitespace.
    """
    stripped = (match.group().strip() for match in _SENTENCE_PATTERN.finditer(text))
    # Whitespace-only matches (e.g. the gap after a final terminator) strip to
    # "" and are filtered out here.
    return [sentence for sentence in stripped if sentence]


class ChunkSplit:
Expand Down Expand Up @@ -56,8 +67,8 @@ def _get_text_splitter(self, split_type: str):
chunk_size=500, chunk_overlap=30, separators=self.separators
).split_text
if split_type == SPLIT_TYPE_SENTENCE:
return RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=0, separators=self.separators).split_text
raise ValueError("Type must be paragraph, sentence, html or markdown")
return _split_sentence_boundaries
raise ValueError("split_type must be document, paragraph, or sentence")

def run(self, context: Optional[Dict[str, Any]]) -> Dict[str, Any]:
all_chunks = []
Expand Down
24 changes: 21 additions & 3 deletions hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@

from hugegraph_llm.flows import FlowName
from hugegraph_llm.flows.scheduler import SchedulerSingleton
from hugegraph_llm.operators.document_op.chunk_split import (
SPLIT_TYPE_DOCUMENT,
VALID_SPLIT_TYPES,
)

from ..config import huge_settings
from .hugegraph_utils import clean_hg_data
Expand Down Expand Up @@ -77,14 +81,28 @@ def clean_all_graph_data():
gr.Info("Clear graph data successfully!")


def extract_graph(input_file, input_text, schema, example_prompt) -> str:
def extract_graph(
input_file,
input_text,
schema,
example_prompt,
split_type=SPLIT_TYPE_DOCUMENT,
) -> str:
texts = read_documents(input_file, input_text)
scheduler = SchedulerSingleton.get_instance()
if not schema:
return "ERROR: please input with correct schema/format."

if split_type not in VALID_SPLIT_TYPES:
raise gr.Error("split_type must be document, paragraph, or sentence")
try:
return scheduler.schedule_flow(FlowName.GRAPH_EXTRACT, schema, texts, example_prompt, "property_graph")
return scheduler.schedule_flow(
FlowName.GRAPH_EXTRACT,
schema,
texts,
example_prompt,
"property_graph",
split_type=split_type,
)
except Exception as e: # pylint: disable=broad-exception-caught
log.error(e)
raise gr.Error(str(e))
Expand Down
Loading
Loading