Pipeline Explorer
Click a step
Read
Parse
Summarize
Save
Read: handle weird encodings safely
# ====== 1) READ RAW BYTES + DECODE SAFELY ======
# Read the file as bytes so the encoding is decided here, not guessed by
# open(); the context manager closes the handle as soon as the read is done.
with open(INPUT_PATH, "rb") as fh:
    raw = fh.read()
print("First 80 bytes:", raw[:80])

# UTF-16 decoders accept almost any even-length byte string, so trying them
# unconditionally turns cp1252/latin1 files into gibberish before those codecs
# ever get a turn. Only offer UTF-16 when the bytes show evidence of it.
if raw.startswith((b"\xff\xfe", b"\xfe\xff")):
    # BOM present: the plain "utf-16" codec consumes it and picks the byte
    # order itself.
    utf16_encodings = ("utf-16",)
elif b"\x00" in raw:
    # No BOM, but NUL bytes. In UTF-16 an ASCII-range character carries its
    # NUL at an even offset for big-endian and at an odd offset for
    # little-endian, so try the more plausible byte order first.
    nul_at_even = raw[0::2].count(b"\x00")
    nul_at_odd = raw[1::2].count(b"\x00")
    if nul_at_even > nul_at_odd:
        utf16_encodings = ("utf-16be", "utf-16le")
    else:
        utf16_encodings = ("utf-16le", "utf-16be")
else:
    utf16_encodings = ()
# NOTE(review): BOM-less UTF-16 that contains only ASCII-range characters is
# also valid UTF-8 (with embedded NULs) and is still claimed by "utf-8-sig"
# below, exactly as before — confirm whether such files need support.

text = None
enc_used = None
for enc in ("utf-8-sig", *utf16_encodings, "cp1252", "latin1"):
    try:
        candidate = raw.decode(enc)
    except UnicodeError:
        # These bytes are not valid in this encoding; try the next one.
        continue
    # Accept the first decoding that holds anything besides whitespace.
    if candidate.strip():
        text = candidate
        enc_used = enc
        break

if text is None:
    # latin1 can decode any byte string, so reaching this point means every
    # successful decoding was empty or whitespace-only.
    raise ValueError(
        "Could not decode car_reviews.csv into readable text. "
        "Try re-saving as 'CSV UTF-8 (Comma delimited)' from Excel."
    )
print("Decoded with:", enc_used)

# Keep only non-blank lines, with surrounding whitespace removed.
lines = [stripped for stripped in map(str.strip, text.splitlines()) if stripped]