S1005403_RisCC/tools/extract_pdf_commands.py

58 lines
1.6 KiB
Python

import sys
from pathlib import Path
# Dump the parts of a PDF manual that mention target/command keywords.
#
# The script extracts the text of every page, prints a window of text around
# the first occurrence of each keyword, and then prints pages 38-41 in full.
# If no keyword occurs at all, it prints the first 3000 characters instead.
#
# Exit status: 0 on success, 2 if the PDF is missing, 3 if no PDF library
# could produce a reader for it.

# Manual to scan; a relative path, so it is resolved against the working dir.
PDF_PATH = Path('SUM7056227 Rev. A.pdf')

# Lower-case search terms; matching is done against lower-cased PDF text.
KEYWORDS = ('tgtinit', 'tgtset', 'tgtreset', 'command', 'parameters', 'format')

# Characters of context printed before / after the start of a keyword hit.
CONTEXT_BEFORE = 200
CONTEXT_AFTER = 400

# 1-based inclusive page range printed in full once any keyword matched.
FIRST_PAGE = 38
LAST_PAGE = 41

# Number of characters printed when no keyword matched.
FALLBACK_CHARS = 3000


def find_keyword_contexts(full, keywords=KEYWORDS,
                          before=CONTEXT_BEFORE, after=CONTEXT_AFTER):
    """Return ``(keyword, context)`` pairs for each keyword found in *full*.

    Only the first case-insensitive occurrence of each keyword is reported.
    *context* is the slice of *full* from ``before`` characters ahead of the
    hit to ``after`` characters past its start, clamped to the text bounds.
    Keywords are expected to be lower-case already. Keywords that do not
    occur are omitted, so an empty list means nothing matched.
    """
    # Lower-case once instead of once per keyword.
    # NOTE: str.lower() can change the length of a few non-ASCII characters,
    # in which case the window is slightly shifted relative to *full*.
    lowered = full.lower()
    hits = []
    for kw in keywords:
        idx = lowered.find(kw)
        if idx == -1:
            continue
        # Slicing clamps the upper bound; only the lower bound needs a guard
        # (a negative start would otherwise count from the end of the text).
        hits.append((kw, full[max(0, idx - before):idx + after]))
    return hits


def _open_reader(path):
    """Return a reader for *path* from pypdf, else PyPDF2, else ``None``.

    Prints the reason to stdout when no reader could be created, and
    distinguishes "library not importable" from "library failed to open the
    file" so a broken PDF is not reported as a missing dependency.
    """
    import importlib  # local import: the file's import block stays as-is

    last_import_error = None
    open_failures = []
    for module_name in ('pypdf', 'PyPDF2'):
        try:
            module = importlib.import_module(module_name)
        except Exception as exc:  # broad on purpose: any import failure -> try next
            last_import_error = exc
            continue
        try:
            return module.PdfReader(str(path))
        except Exception as exc:  # library-specific errors; fall through to next
            open_failures.append((module_name, exc))

    if open_failures:
        for module_name, exc in open_failures:
            print(f'{module_name} could not open {path}:', exc)
    else:
        print('No suitable PDF reader installed:', last_import_error)
    return None


def _extract_pages(reader):
    """Extract text from every page of *reader*.

    Returns ``(texts, errors)``: *texts* has one string per page (empty when
    a page yields nothing or fails), *errors* maps the 0-based index of each
    failed page to the exception raised.
    """
    texts = []
    errors = {}
    for index, page in enumerate(reader.pages):
        try:
            texts.append(page.extract_text() or '')
        except Exception as exc:  # best effort: one bad page must not abort
            texts.append('')
            errors[index] = exc
    return texts, errors


def main(pdf_path=PDF_PATH):
    """Run the extraction and return the process exit status (0, 2 or 3)."""
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        print('PDF not found at', pdf_path)
        return 2

    reader = _open_reader(pdf_path)
    if reader is None:
        return 3

    texts, errors = _extract_pages(reader)
    if errors:
        # Report on stderr so the stdout dump keeps its original shape.
        print(f'warning: text extraction failed on {len(errors)} page(s)',
              file=sys.stderr)
    full = '\n'.join(texts)

    hits = find_keyword_contexts(full)
    for kw, ctx in hits:
        print('\n--- context around "{}" ---\n'.format(kw))
        print(ctx)

    if not hits:
        # Fallback: print the start of the text for manual inspection.
        print(f'\n--- No keywords found; printing first {FALLBACK_CHARS} '
              'chars of PDF text ---\n')
        print(full[:FALLBACK_CHARS])
        return 0

    # Also print the specific pages around the TOC entries for clarity.
    print(f'\n--- Explicitly printing pages {FIRST_PAGE}-{LAST_PAGE} ---\n')
    for index in range(max(0, FIRST_PAGE - 1), min(len(texts), LAST_PAGE)):
        print(f'--- PAGE {index+1} ---\n')
        if index in errors:
            print('ERROR extracting page', index + 1, errors[index])
        else:
            # Reuse the text from the first pass instead of extracting again.
            print(texts[index])
    return 0


if __name__ == '__main__':
    sys.exit(main())