Shami96 committed on
Commit
88a026a
·
verified ·
1 Parent(s): f1bab1c

Rename word_updater.py to word_extractor.py

Browse files
Files changed (2) hide show
  1. word_extractor.py +61 -0
  2. word_updater.py +0 -22
word_extractor.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # word_extractor.py
2
+ from docx import Document
3
+ from docx.shared import RGBColor
4
+ from collections import defaultdict
5
+ from typing import List, Dict
6
+
7
+
8
def is_red_font(run) -> bool:
    """Report whether *run* is drawn in a predominantly red font colour.

    A run counts as red when its font carries an RGB colour whose red
    channel is above 150 while the green and blue channels are both
    below 100.  Runs without a colour, or without an RGB value, are not red.

    Args:
        run: An object exposing ``font.color.rgb`` as an indexable
            ``(r, g, b)`` sequence (or a falsy value when unset).

    Returns:
        ``True`` for a red-ish run, ``False`` otherwise.
    """
    color = run.font.color
    if not color:
        return False

    rgb = color.rgb
    if not rgb:
        return False

    red, green, blue = rgb[0], rgb[1], rgb[2]
    return red > 150 and green < 100 and blue < 100
14
+
15
+
16
def get_full_text_if_red(para):
    """Collect the red text of a paragraph, one entry per contiguous red stretch.

    Consecutive runs for which ``is_red_font`` is true are concatenated into
    a single fragment; a non-red run ends the current fragment.  Each
    fragment is stripped of surrounding whitespace, and fragments that are
    empty after stripping (e.g. a red run holding only spaces) are dropped
    so callers never receive blank entries.

    Args:
        para: An object exposing ``runs``, an iterable of runs that each
            have a ``text`` attribute.

    Returns:
        A list of non-empty red text fragments in document order.
    """
    red_texts = []
    pieces = []  # texts of the red runs in the stretch currently being read

    def flush():
        # Close the current stretch and keep it only if it has visible text.
        fragment = "".join(pieces).strip()
        pieces.clear()
        if fragment:
            red_texts.append(fragment)

    for run in para.runs:
        if is_red_font(run):
            pieces.append(run.text)
        else:
            flush()
    flush()  # a paragraph may end while still inside a red stretch
    return red_texts
32
+
33
+
34
def extract_red_text_with_labels(doc_path: str) -> Dict[str, List[str]]:
    """Extract red-coloured text from a Word document, grouped by label.

    Grouping rules:

    * Red text found in ``document.paragraphs`` is stored under
      ``"Unlabeled"``.
    * For table rows with two or more cells, the first cell's text (colons
      removed, newlines turned into spaces, surrounding whitespace trimmed)
      is the label and the red text of the second cell is the value.  An
      empty label falls back to ``"Table_<n>_Row_<m>"`` (1-based).  Cells
      beyond the second are not inspected.
    * For single-cell rows, red text is stored under
      ``"Single_Column_Table_<n>"`` (1-based).

    Blank fragments are skipped everywhere, and a key is only created when
    it has at least one value.

    Args:
        doc_path: Path of the ``.docx`` file to read.

    Returns:
        A ``defaultdict(list)`` mapping each label to its red text
        fragments in document order.
    """
    document = Document(doc_path)
    results = defaultdict(list)

    def red_fragments(paragraphs):
        # Same blank filter for body paragraphs and table cells alike.
        return [
            text
            for para in paragraphs
            for text in get_full_text_if_red(para)
            if text.strip()
        ]

    body_texts = red_fragments(document.paragraphs)
    if body_texts:
        results["Unlabeled"].extend(body_texts)

    for table_idx, table in enumerate(document.tables, start=1):
        for row_idx, row in enumerate(table.rows, start=1):
            cells = row.cells
            if len(cells) >= 2:
                values = red_fragments(cells[1].paragraphs)
                # Strip last so "Name :" yields "Name", not "Name ".
                label = cells[0].text.replace(":", "").replace("\n", " ").strip()
                key = label if label else f"Table_{table_idx}_Row_{row_idx}"
            elif len(cells) == 1:
                values = red_fragments(cells[0].paragraphs)
                key = f"Single_Column_Table_{table_idx}"
            else:
                continue

            if values:
                results[key].extend(values)
    return results
word_updater.py DELETED
@@ -1,22 +0,0 @@
1
- # word_updater.py
2
-
3
- from docx import Document
4
- from docx.shared import RGBColor
5
-
6
def is_red(run):
    """Tell whether *run* uses exactly pure red, ``RGBColor(255, 0, 0)``.

    Returns the falsy ``run.font.color`` value itself when no colour object
    is present; otherwise the boolean result of comparing its ``rgb``
    against pure red.
    """
    font_color = run.font.color
    if not font_color:
        # Mirror short-circuit semantics: hand back the falsy value as-is.
        return font_color
    return font_color.rgb == RGBColor(255, 0, 0)
9
-
10
def fill_template_with_data(template_path, output_path, data):
    """Fill the red placeholder runs of a Word template and save the result.

    Every run in the document's body paragraphs that ``is_red`` accepts is
    checked against the keys of *data* in iteration order.  The first key
    that occurs, case-insensitively, inside the run's text causes the
    run's entire text to be replaced with that key's value.

    Args:
        template_path: Path of the template document to open.
        output_path: Path the filled document is saved to.
        data: Mapping of placeholder key to replacement text.

    Returns:
        ``output_path``, unchanged.
    """
    doc = Document(template_path)

    red_runs = (
        run
        for para in doc.paragraphs
        for run in para.runs
        if is_red(run)
    )
    for run in red_runs:
        current = run.text
        lowered = current.lower()
        matched_key = next((key for key in data if key.lower() in lowered), None)
        if matched_key is not None:
            # Replacing the text with itself as the pattern swaps the whole run.
            run.text = current.replace(current, data[matched_key])

    doc.save(output_path)
    return output_path