# S1005403_RisCC/tools/extract_pdf_commands.py
#
# Listing metadata from the source viewer: 61 lines, 1.6 KiB, Python.

import re
import sys
from pathlib import Path
# Extract the text of a PDF manual and print the passages that describe
# commands: a context window around the first hit of each keyword, followed
# by a full dump of a fixed page range.
#
# Usage: python extract_pdf_commands.py [PDF_PATH]
#
# Exit codes: 0 = text printed, 2 = PDF file missing,
#             3 = PDF could not be opened (no library, or the file is unreadable).

# PDF that is read when no path is given on the command line.
DEFAULT_PDF = Path("SUM7056227 Rev. A.pdf")

# Search terms, matched case-insensitively; only the first hit of each is shown.
KEYWORDS = ("tgtinit", "tgtset", "tgtreset", "command", "parameters", "format")

# Number of characters printed before / after the start of a keyword hit.
CONTEXT_BEFORE = 200
CONTEXT_AFTER = 400

# 1-based, inclusive page range around the TOC entries that is dumped in full
# whenever at least one keyword was found.
FIRST_PAGE = 38
LAST_PAGE = 41

# Number of leading characters printed when no keyword is found at all.
FALLBACK_CHARS = 3000


def _import_reader_class(library):
    """Return the ``PdfReader`` class of *library* ("pypdf" or "PyPDF2").

    Raises:
        ImportError: the library is not installed or exposes no ``PdfReader``.
    """
    if library == "pypdf":
        from pypdf import PdfReader
    else:
        from PyPDF2 import PdfReader
    return PdfReader


def _open_reader(pdf_path: Path):
    """Open *pdf_path* with the first PDF library that can read it.

    pypdf is tried first, then PyPDF2. Import failures and open failures are
    tracked separately so that an unreadable PDF is not misreported as a
    missing library.

    Returns:
        A reader object exposing ``.pages``, or ``None`` after printing the
        reason when no library could open the file.
    """
    not_installed = []
    open_errors = []
    for library in ("pypdf", "PyPDF2"):
        try:
            reader_cls = _import_reader_class(library)
        except ImportError as exc:
            not_installed.append(f"{library}: {exc}")
            continue
        try:
            return reader_cls(str(pdf_path))
        except Exception as exc:  # the parse error types are library-specific
            open_errors.append(f"{library}: {exc}")

    if open_errors:
        print("Could not open PDF:", "; ".join(open_errors))
    else:
        print("No suitable PDF reader installed:", "; ".join(not_installed))
    return None


def _extract_page_texts(reader):
    """Return the extracted text of every page, one string per page.

    Extraction is best effort: a page that fails yields an empty string so
    the remaining pages are still processed, and a warning goes to stderr.
    """
    texts = []
    for number, page in enumerate(reader.pages, start=1):
        try:
            texts.append(page.extract_text() or "")
        except Exception as exc:
            print(
                f"warning: could not extract text from page {number}: {exc}",
                file=sys.stderr,
            )
            texts.append("")
    return texts


def _keyword_context(text, keyword, before=CONTEXT_BEFORE, after=CONTEXT_AFTER):
    """Return the text surrounding the first case-insensitive hit of *keyword*.

    The window starts *before* characters ahead of the hit and ends *after*
    characters past the start of the hit, clipped to the bounds of *text*.
    Returns ``None`` when the keyword does not occur.
    """
    # Search the original string instead of a lowercased copy: str.lower()
    # can change the length of the text (e.g. U+0130), which would shift the
    # indices used for slicing.
    match = re.search(re.escape(keyword), text, re.IGNORECASE)
    if match is None:
        return None
    start = max(0, match.start() - before)
    return text[start:match.start() + after]


def main(argv=None) -> int:
    """Run the extraction and return the process exit code.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, if present, is the PDF path;
            otherwise ``DEFAULT_PDF`` is used.

    Returns:
        0 when text was printed, 2 when the PDF does not exist, 3 when no
        installed PDF library could open it.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    pdf_path = Path(args[0]) if args else DEFAULT_PDF
    if not pdf_path.exists():
        print("PDF not found at", pdf_path)
        return 2

    reader = _open_reader(pdf_path)
    if reader is None:
        return 3

    full = "\n".join(_extract_page_texts(reader))

    found = False
    for keyword in KEYWORDS:
        context = _keyword_context(full, keyword)
        if context is None:
            continue
        found = True
        print(f'\n--- context around "{keyword}" ---\n')
        print(context)

    if not found:
        # Fallback: print the beginning of the text for manual inspection.
        print(
            f"\n--- No keywords found; printing first {FALLBACK_CHARS} chars"
            " of PDF text ---\n"
        )
        print(full[:FALLBACK_CHARS])
        return 0

    # Also print the specific pages around the TOC entries for clarity.
    print(f"\n--- Explicitly printing pages {FIRST_PAGE}-{LAST_PAGE} ---\n")
    pages = reader.pages
    for index in range(max(0, FIRST_PAGE - 1), min(len(pages), LAST_PAGE)):
        print(f"--- PAGE {index + 1} ---\n")
        try:
            print(pages[index].extract_text() or "")
        except Exception as exc:
            print("ERROR extracting page", index + 1, exc)
    return 0


if __name__ == "__main__":
    sys.exit(main())