buscando por internet encontre esta clase PHP que lee un pdf y convierte a texto pero de vez en cuando falla y no hace bien los saltos de linea y mete caracteres raros,
(no se si es porque con los pdfs que trabajo son archivos muy grandes: de mas de 80 paginas)
Código PHP:
Ver original<?php
class PDF2Text {
// Some settings
var $multibyte = 2; // Use setUnicode(TRUE|FALSE)
var $convertquotes = ENT_QUOTES; // ENT_COMPAT (double-quotes), ENT_QUOTES (Both), ENT_NOQUOTES (None)
// Variables
var $filename = '';
var $decodedtext = '';
function setFilename($filename) {
// Reset
$this->decodedtext = '';
$this->filename = $filename;
}
function output($echo = false) {
if($echo) echo $this->decodedtext;
else return $this->decodedtext;
}
function setUnicode($input) {
// 4 for unicode. But 2 should work in most cases just fine
if($input == true) $this->multibyte = 4;
else $this->multibyte = 2;
}
function decodePDF() {
// Read the data from pdf file
return "";
// Get all text data.
$transformations = array();
// Get the list of all objects.
preg_match_all("#obj[\n|\r](.*)endobj[\n|\r]#ismU", $infile, $objects); $objects = @$objects[1];
// Select objects with streams.
for ($i = 0; $i < count($objects); $i++) { $currentObject = $objects[$i];
// Check if an object includes data stream.
if (preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU", $currentObject, $stream)) { $stream = ltrim($stream[1]);
// Check object parameters and look for text data.
$options = $this->getObjectOptions($currentObject);
if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"]))) continue;
// Hack, length doesnt always seem to be correct
unset($options["Length"]);
// So, we have text data. Decode it.
$data = $this->getDecodedStream($stream, $options);
if (preg_match_all("#BT[\n|\r](.*)ET[\n|\r]#ismU", $data, $textContainers)) { $textContainers = @$textContainers[1];
$this->getDirtyTexts($texts, $textContainers);
} else
$this->getCharTransformations($transformations, $data);
}
}
}
// Analyze text blocks taking into account character transformations and return results.
$this->decodedtext = $this->getTextUsingTransformations($texts, $transformations);
}
function decodeAsciiHex($input) {
$output = "";
$isOdd = true;
$isComment = false;
for($i = 0, $codeHigh = -1; $i < strlen($input) && $input[$i] != '>'; $i++) { $c = $input[$i];
if($isComment) {
if ($c == '\r' || $c == '\n')
$isComment = false;
continue;
}
switch($c) {
case '\0': case '\t': case '\r': case '\f': case '\n': case ' ': break;
case '%':
$isComment = true;
break;
default:
if($code === 0 && $c != '0')
return "";
if($isOdd)
$codeHigh = $code;
else
$output .= chr($codeHigh * 16 + $code);
$isOdd = !$isOdd;
break;
}
}
if($input[$i] != '>')
return "";
if($isOdd)
$output .= chr($codeHigh * 16);
return $output;
}
function decodeAscii85($input) {
$output = "";
$isComment = false;
for($i = 0, $state = 0; $i < strlen($input) && $input[$i] != '~'; $i++) { $c = $input[$i];
if($isComment) {
if ($c == '\r' || $c == '\n')
$isComment = false;
continue;
}
if ($c == '\0' || $c == '\t' || $c == '\r' || $c == '\f' || $c == '\n' || $c == ' ')
continue;
if ($c == '%') {
$isComment = true;
continue;
}
if ($c == 'z' && $state === 0) {
continue;
}
if ($c < '!' || $c > 'u')
return "";
$code = ord($input[$i]) & 0xff; $ords[$state++] = $code - ord('!');
if ($state == 5) {
$state = 0;
for ($sum = 0, $j = 0; $j < 5; $j++)
$sum = $sum * 85 + $ords[$j];
for ($j = 3; $j >= 0; $j--)
$output .= chr($sum >> ($j * 8)); }
}
if ($state === 1)
return "";
elseif ($state > 1) {
for ($i = 0, $sum = 0; $i < $state; $i++)
$sum += ($ords[$i] + ($i == $state - 1)) * pow(85, 4 - $i); for ($i = 0; $i < $state - 1; $i++)
$ouput .= chr($sum >> ((3 - $i) * 8)); }
return $output;
}
function decodeFlate($input) {
}
function getObjectOptions($object) {
if (preg_match("#<<(.*)>>#ismU", $object, $options)) { $options = explode("/", $options[1]);
for ($j = 0; $j < @count($options); $j++) { if (strpos($options[$j], " ") !== false) { $parts = explode(" ", $options[$j]); $o[$parts[0]] = $parts[1];
} else
$o[$options[$j]] = true;
}
$options = $o;
}
return $options;
}
function getDecodedStream($stream, $options) {
$data = "";
if (empty($options["Filter"])) $data = $stream;
else {
$length = !empty($options["Length"]) ?
$options["Length"] : strlen($stream); $_stream = substr($stream, 0, $length);
foreach ($options as $key => $value) {
if ($key == "ASCIIHexDecode")
$_stream = $this->decodeAsciiHex($_stream);
if ($key == "ASCII85Decode")
$_stream = $this->decodeAscii85($_stream);
if ($key == "FlateDecode")
$_stream = $this->decodeFlate($_stream);
if ($key == "Crypt") { // TO DO
}
}
$data = $_stream;
}
return $data;
}
function getDirtyTexts(&$texts, $textContainers) {
for ($j = 0; $j < count($textContainers); $j++) { if (preg_match_all("#\[(.*)\]\s*TJ[\n|\r]#ismU", $textContainers[$j], $parts)) elseif(preg_match_all("#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts)) elseif(preg_match_all("#T[d|w|m|f]\s*(\[.*\])\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts)) }
}
function getCharTransformations(&$transformations, $stream) {
preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER
); preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER
);
for ($j = 0; $j < count($chars); $j++) { $count = $chars[$j][1];
for ($k = 0; $k < $count && $k < count($current); $k++) { if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map)) $transformations[str_pad($map[1], 4, "0")] = $map[2]; }
}
for ($j = 0; $j < count($ranges); $j++) { $count = $ranges[$j][1];
for ($k = 0; $k < $count && $k < count($current); $k++) { if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", trim($current[$k]), $map)) {
for ($m = $from, $n = 0; $m <= $to; $m++, $n++)
} elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", trim($current[$k]), $map)) {
for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++) }
}
}
}
function getTextUsingTransformations($texts, $transformations) {
$document = "";
for ($i = 0; $i < count($texts); $i++) { $isHex = false;
$isPlain = false;
$hex = "";
$plain = "";
for ($j = 0; $j < strlen($texts[$i]); $j++) { $c = $texts[$i][$j];
switch($c) {
case "<":
$hex = "";
$isHex = true;
break;
case ">":
$hexs = str_split($hex, $this->multibyte); // 2 or 4 (UTF8 or ISO) for ($k = 0; $k < count($hexs); $k++) { $chex = str_pad($hexs[$k], 4, "0"); // Add tailing zero if (isset($transformations[$chex])) $chex = $transformations[$chex];
}
$isHex = false;
break;
case "(":
$plain = "";
$isPlain = true;
break;
case ")":
$document .= $plain;
$isPlain = false;
break;
case "\\":
$c2 = $texts[$i][$j + 1];
elseif ($c2 == "n") $plain .= '\n';
elseif ($c2 == "r") $plain .= '\r';
elseif ($c2 == "t") $plain .= '\t';
elseif ($c2 == "b") $plain .= '\b';
elseif ($c2 == "f") $plain .= '\f';
elseif ($c2 >= '0' && $c2 <= '9') {
}
$j++;
break;
default:
if ($isHex)
$hex .= $c;
if ($isPlain)
$plain .= $c;
break;
}
}
$document .= "\n";
}
return $document;
}
}
?>
para visualizar el pdf tenemos que incluir y llamar a la clase tal que asi
Código PHP:
Ver originalinclude('class.pdf2text.php');
$a = new PDF2Text();
$path = $_SERVER['DOCUMENT_ROOT']."LectorPDF/pdfs/21.pdf";
$a->setFilename($path);
$a->decodePDF();
echo $a->output();
en el echo visualizo el texto por pantalla, lo malo es que cuando llega a algunos saltos de linea (no en todos) decodifica mal y mete caracteres raros. no se a que puede ser debido
si alguien se le da por probarlo y me puede decir si es por culpa de mi equipo o tambien os pasa a vosotros os lo agradezo