chore: update something
Browse files
- docsifer/service.py +27 -29
docsifer/service.py
CHANGED
|
@@ -131,37 +131,32 @@ class DocsiferService:
|
|
| 131 |
|
| 132 |
logger.info("Converting file: %s (cleanup=%s)", source, cleanup)
|
| 133 |
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
)
|
| 154 |
-
|
| 155 |
-
# Perform HTML cleanup if requested.
|
| 156 |
-
if cleanup and guessed_ext.lower() in (".html", ".htm"):
|
| 157 |
-
self._maybe_cleanup_html(tmp_path)
|
| 158 |
|
| 159 |
-
|
| 160 |
-
|
|
|
|
| 161 |
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
print(f"Filename: {filename}, Source: {source}, Content: {xxx}")
|
| 165 |
|
| 166 |
# Decide whether to use LLM-enhanced conversion or the basic converter.
|
| 167 |
if openai_config and openai_config.get("api_key"):
|
|
@@ -174,6 +169,9 @@ class DocsiferService:
|
|
| 174 |
except Exception as e:
|
| 175 |
logger.error("MarkItDown conversion failed: %s", e)
|
| 176 |
raise RuntimeError(f"Conversion failed for '{source}': {e}")
|
|
|
|
|
|
|
|
|
|
| 177 |
|
| 178 |
# Count tokens in the resulting markdown text.
|
| 179 |
token_count = self._count_tokens(result_obj.text_content)
|
|
|
|
| 131 |
|
| 132 |
logger.info("Converting file: %s (cleanup=%s)", source, cleanup)
|
| 133 |
|
| 134 |
+
mime_type = magic.from_file(str(src), mime=True)
|
| 135 |
+
guessed_ext = mimetypes.guess_extension(mime_type) or ".tmp"
|
| 136 |
+
if not mime_type:
|
| 137 |
+
logger.warning(f"Could not detect file type for: {src}")
|
| 138 |
+
new_filename = src.name
|
| 139 |
+
else:
|
| 140 |
+
logger.debug(f"Detected MIME type '{mime_type}' for: {src}")
|
| 141 |
+
new_filename = f"{src.stem}{guessed_ext}"
|
| 142 |
+
tmp_path = src.parent / new_filename
|
| 143 |
+
tmp_path.write_bytes(src.read_bytes())
|
| 144 |
+
src.unlink()
|
| 145 |
+
|
| 146 |
+
logger.info(
|
| 147 |
+
"Using temp file: %s, MIME type: %s, Guessed ext: %s, Existing: %s",
|
| 148 |
+
tmp_path,
|
| 149 |
+
mime_type,
|
| 150 |
+
guessed_ext,
|
| 151 |
+
tmp_path.exists(),
|
| 152 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
+
# Perform HTML cleanup if requested.
|
| 155 |
+
if cleanup and guessed_ext.lower() in (".html", ".htm"):
|
| 156 |
+
self._maybe_cleanup_html(tmp_path)
|
| 157 |
|
| 158 |
+
filename = new_filename
|
| 159 |
+
source = tmp_path
|
|
|
|
| 160 |
|
| 161 |
# Decide whether to use LLM-enhanced conversion or the basic converter.
|
| 162 |
if openai_config and openai_config.get("api_key"):
|
|
|
|
| 169 |
except Exception as e:
|
| 170 |
logger.error("MarkItDown conversion failed: %s", e)
|
| 171 |
raise RuntimeError(f"Conversion failed for '{source}': {e}")
|
| 172 |
+
|
| 173 |
+
if isinstance(source, Path) and source.exists():
|
| 174 |
+
source.unlink()
|
| 175 |
|
| 176 |
# Count tokens in the resulting markdown text.
|
| 177 |
token_count = self._count_tokens(result_obj.text_content)
|