Skip to content

Commit ae45e95

Browse files
Improve Text Chunker (#168)
1 parent 370d082 commit ae45e95

File tree

10 files changed

+860
-387
lines changed

10 files changed

+860
-387
lines changed

deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py

+4-7
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,6 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill:
283283

284284
def get_semantic_chunker_skill(
285285
self,
286-
num_surrounding_sentences: int = 2,
287286
similarity_threshold: float = 0.8,
288287
max_chunk_tokens: int = 500,
289288
min_chunk_tokens: int = 150,
@@ -294,7 +293,6 @@ def get_semantic_chunker_skill(
294293
-----
295294
context (str): The context of the skill
296295
source (str): The source of the skill
297-
num_surrounding_sentences (int, optional): The number of surrounding sentences. Defaults to 1.
298296
similarity_threshold (float, optional): The similarity threshold. Defaults to 0.8.
299297
max_chunk_tokens (int, optional): The maximum number of tokens. Defaults to 500.
300298
@@ -314,8 +312,8 @@ def get_semantic_chunker_skill(
314312
name="content", source="/document/layout_merged_content"
315313
),
316314
InputFieldMappingEntry(
317-
name="per_page_starting_sentences",
318-
source="/document/per_page_starting_sentences",
315+
name="page_number_tracking_holders",
316+
source="/document/page_number_tracking_holders",
319317
),
320318
]
321319

@@ -333,7 +331,6 @@ def get_semantic_chunker_skill(
333331
degree_of_parallelism=degree_of_parallelism,
334332
http_method="POST",
335333
http_headers={
336-
"num_surrounding_sentences": num_surrounding_sentences,
337334
"similarity_threshold": similarity_threshold,
338335
"max_chunk_tokens": max_chunk_tokens,
339336
"min_chunk_tokens": min_chunk_tokens,
@@ -385,8 +382,8 @@ def get_layout_analysis_skill(
385382
output = [
386383
OutputFieldMappingEntry(name="layout", target_name="layout"),
387384
OutputFieldMappingEntry(
388-
name="per_page_starting_sentences",
389-
target_name="per_page_starting_sentences",
385+
name="page_number_tracking_holders",
386+
target_name="page_number_tracking_holders",
390387
),
391388
]
392389

image_processing/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ This skill merges the layout output with the figure outputs to create a unified
9898

9999
### Semantic Chunker Custom Skill
100100

101-
You can then test the chunking by sending a AI Search JSON format to the `/semantic_text_chunker/ HTTP endpoint. The header controls the different chunking parameters *(num_surrounding_sentences, similarity_threshold, max_chunk_tokens, min_chunk_tokens)*.
101+
You can then test the chunking by sending an AI Search JSON format to the `/semantic_text_chunker/` HTTP endpoint. The header controls the different chunking parameters *(similarity_threshold, max_chunk_tokens, min_chunk_tokens)*.
102102

103103
### MarkUp Cleaner Custom Skill
104104

image_processing/src/image_processing/function_app.py

-4
Original file line numberDiff line numberDiff line change
@@ -171,9 +171,6 @@ async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse:
171171

172172
semantic_text_chunker_config = req.headers
173173

174-
num_surrounding_sentences = int(
175-
semantic_text_chunker_config.get("num_surrounding_sentences", 1)
176-
)
177174
similarity_threshold = float(
178175
semantic_text_chunker_config.get("similarity_threshold", 0.8)
179176
)
@@ -192,7 +189,6 @@ async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse:
192189
record_tasks = []
193190

194191
semantic_text_chunker_processor = SemanticTextChunker(
195-
num_surrounding_sentences=num_surrounding_sentences,
196192
similarity_threshold=similarity_threshold,
197193
max_chunk_tokens=max_chunk_tokens,
198194
min_chunk_tokens=min_chunk_tokens,

image_processing/src/image_processing/layout_analysis.py

+32-15
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,9 @@
2222
LayoutHolder,
2323
PageWiseContentHolder,
2424
NonPageWiseContentHolder,
25-
PerPageStartingSentenceHolder,
25+
PageNumberTrackingHolder,
2626
)
27+
import re
2728

2829

2930
class StorageAccountHelper:
@@ -341,14 +342,14 @@ def create_page_wise_content(self) -> list[LayoutHolder]:
341342

342343
return page_wise_contents
343344

344-
def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolder]:
345+
def create_page_number_tracking_holder(self) -> list[PageNumberTrackingHolder]:
345346
"""Create a list of the starting sentence of each page so we can assign the starting sentence to the page number.
346347
347348
Returns:
348349
--------
349350
list: A list of the cleaned page content for each page."""
350351

351-
per_page_starting_sentences = []
352+
page_number_tracking_holders = []
352353

353354
for page in self.result.pages:
354355
page_content = self.result.content[
@@ -358,22 +359,38 @@ def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolde
358359

359360
# Remove any leading whitespace/newlines.
360361
cleaned_content = page_content.lstrip()
361-
# If a newline appears before a period, split on newline; otherwise, on period.
362-
if "\n" in cleaned_content:
363-
first_line = cleaned_content.split("\n", 1)[0]
364-
elif "." in cleaned_content:
365-
first_line = cleaned_content.split(".", 1)[0]
362+
# Strip the html comment but keep the content
363+
html_comments_pattern = re.compile(r"<!--.*?-->", re.DOTALL)
364+
cleaned_content = html_comments_pattern.sub("", cleaned_content)
365+
366+
# Remove anything inside a figure tag
367+
cleaned_content = re.sub(
368+
"<figure>(.*?)</figure>",
369+
"",
370+
cleaned_content,
371+
flags=re.DOTALL | re.MULTILINE,
372+
)
373+
logging.info(f"Page Number: {page.page_number}")
374+
logging.info(f"Content for Page Detection: {page_content}")
375+
logging.info(f"Cleaned Content for Page Detection: {cleaned_content}")
376+
377+
if len(cleaned_content) == 0:
378+
logging.error(
379+
"No content found in the cleaned result for page %s.",
380+
page.page_number,
381+
)
382+
cleaned_content = None
366383
else:
367-
first_line = cleaned_content
384+
cleaned_content = cleaned_content.strip()
368385

369-
per_page_starting_sentences.append(
370-
PerPageStartingSentenceHolder(
386+
page_number_tracking_holders.append(
387+
PageNumberTrackingHolder(
371388
page_number=page.page_number,
372-
starting_sentence=first_line.strip(),
389+
page_content=cleaned_content,
373390
)
374391
)
375392

376-
return per_page_starting_sentences
393+
return page_number_tracking_holders
377394

378395
async def get_document_intelligence_client(self) -> DocumentIntelligenceClient:
379396
"""Get the Azure Document Intelligence client.
@@ -522,11 +539,11 @@ async def analyse(self):
522539
if self.extract_figures:
523540
await self.process_figures_from_extracted_content(text_content)
524541

525-
per_page_starting_sentences = self.create_per_page_starting_sentence()
542+
page_number_tracking_holders = self.create_page_number_tracking_holder()
526543

527544
output_record = NonPageWiseContentHolder(
528545
layout=text_content,
529-
per_page_starting_sentences=per_page_starting_sentences,
546+
page_number_tracking_holders=page_number_tracking_holders,
530547
)
531548

532549
except Exception as e:

image_processing/src/image_processing/layout_holders.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -47,18 +47,18 @@ class PageWiseContentHolder(BaseModel):
4747
page_wise_layout: list[LayoutHolder]
4848

4949

50-
class PerPageStartingSentenceHolder(BaseModel):
50+
class PageNumberTrackingHolder(BaseModel):
5151
"""A class to hold the starting sentence of each page."""
5252

5353
page_number: int
54-
starting_sentence: str
54+
page_content: str | None
5555

5656

5757
class NonPageWiseContentHolder(BaseModel):
5858
"""A class to hold the non-page-wise content extracted from the document."""
5959

6060
layout: LayoutHolder
61-
per_page_starting_sentences: list[PerPageStartingSentenceHolder] = Field(
61+
page_number_tracking_holders: list[PageNumberTrackingHolder] = Field(
6262
default_factory=list
6363
)
6464

@@ -69,6 +69,5 @@ class ChunkHolder(BaseModel):
6969
mark_up: str
7070
sections: Optional[list[str]] = Field(default_factory=list)
7171
figures: Optional[list[FigureHolder]] = Field(default_factory=list)
72-
starting_sentence: Optional[str] = None
7372
cleaned_text: Optional[str] = None
7473
page_number: Optional[int] = Field(default=None)

0 commit comments

Comments
 (0)