Lucene no analiza para indexar

Manolait · #1 (**permalink**) 17/05/2013, 04:56

Hola, tengo un problema con Lucene y su analizador. Si escribo yo directamente la información en el String, funciona sin problemas.
Ejemplo:

Código HTML:

String bolde = "prueba con el titulo acné  sábado dolores.";
			    doc.add(new Field("bolde", bolde, Field.Store.YES, Field.Index.ANALYZED));

Si lo compruebo con Luke, está todo bien.

Aquí viene el problema: si leo el contenido desde un fichero, no lo analiza. Les dejo mi código por si tengo algún fallo.

Código HTML:

import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.document.*;
import java.io.*;
import java.util.ArrayList;

/**
 * Builds a Lucene index from the .txt files found under a documents directory.
 *
 * <p>The files are classified by name into five parallel lists (bold, text_ES,
 * text_IMG, title, and "everything else", which is indexed as underlined words).
 * The i-th entry of every list is combined into the i-th Lucene document.
 */
public class CreaIndice {

	/**
	 * Charset used to decode the text files.
	 *
	 * <p>It is stated explicitly because {@code FileReader} silently uses the
	 * platform default charset; when that differs from the encoding of the files,
	 * accented characters are corrupted before the analyzer ever sees them.
	 * NOTE(review): assumes the documents are stored as UTF-8 -- change this
	 * constant if they use another encoding (e.g. "ISO-8859-1").
	 */
	private static final String FILE_ENCODING = "UTF-8";

	/** Number of file categories: bold, text_ES, text_IMG, title, other. */
	private static final int NUM_LISTAS = 5;

	/**
	 * One list of absolute file paths per category, indexed as:
	 * 0 = bold, 1 = text_ES, 2 = text_IMG, 3 = title, 4 = any other .txt file.
	 * Must be initialised (see {@link #main}) before {@link #crearLista} is called.
	 */
	private static ArrayList<String>[] lista;

	/** Allocates the five empty category lists. */
	@SuppressWarnings("unchecked") // generic array creation; only ArrayList<String> is ever stored
	private static void inicializarListas () {
		lista = new ArrayList[NUM_LISTAS];
		for (int i = 0; i < NUM_LISTAS; i++) {
			lista[i] = new ArrayList<String>();
		}
	}

	/**
	 * Recursively walks {@code f} and records the absolute path of every .txt
	 * file in the category list that matches its name.
	 *
	 * @param f a file or directory; anything that is neither a .txt file nor a
	 *          directory is ignored
	 */
	public static void crearLista (File f) {
		if (f.isFile() && f.getName().endsWith(".txt")) {
			String nombre = f.getName();
			int categoria;
			// The order of the checks matters: the first matching marker wins.
			if (nombre.contains("bold")) {
				categoria = 0;
			} else if (nombre.contains("text_ES")) {
				categoria = 1;
			} else if (nombre.contains("text_IMG")) {
				categoria = 2;
			} else if (nombre.contains("title")) {
				categoria = 3;
			} else {
				categoria = 4;
			}
			lista[categoria].add(f.getAbsolutePath());
			return;
		}
		if (f.isDirectory()) {
			String[] hijos = f.list();
			if (hijos == null) {
				// File.list() returns null when the directory cannot be read.
				return;
			}
			for (int i = 0; i < hijos.length; i++) {
				crearLista(new File(f, hijos[i]));
			}
		}
	}

	/**
	 * Reads a whole text file into a single string, replacing every line break
	 * with one space (a trailing space follows the last line).
	 *
	 * <p>Reading is best-effort: if an I/O error occurs the problem is reported on
	 * standard error and whatever was read up to that point is returned.
	 *
	 * @param f the file to read, decoded with {@link #FILE_ENCODING}
	 * @return the file contents joined with spaces; empty if nothing could be read
	 */
	public static String leerFichero (File f) {
		StringBuilder s = new StringBuilder();
		BufferedReader br = null;
		try {
			br = new BufferedReader(
					new InputStreamReader(new FileInputStream(f), FILE_ENCODING));
			String cad = br.readLine();
			while (cad != null) {
				s.append(cad);
				s.append(" ");
				cad = br.readLine();
			}
		}
		catch (IOException e) {
			// Do not fail the whole indexing run, but never hide the problem either.
			System.err.println("No se pudo leer el fichero " + f + ": " + e);
		}
		finally {
			if (br != null) {
				try {
					br.close();
				}
				catch (IOException ignored) {
					// Nothing useful can be done if closing a reader fails.
				}
			}
		}
		return s.toString();
	}

	/**
	 * Scans the documents directory, then writes one Lucene document per group of
	 * files into the index directory using the Spanish analyzer.
	 *
	 * @param args unused
	 * @throws Exception if the index cannot be written or a category has fewer
	 *                   files than the "bold" category
	 */
	public static void main(String[] args) throws Exception {

		inicializarListas();

		File directorioGuardarIndice = new File("c:\\Temp\\indice");// index directory
		File Documentos = new File("c:\\Temp\\documentos");// directory with the documents to index

		crearLista(Documentos);

		int num = lista[0].size(); // number of document groups, driven by the "bold" files
		System.out.println("Directorios buscados: "+num);

		// Fail early, before touching the index, if the lists cannot be paired up.
		for (int n = 1; n < NUM_LISTAS; n++) {
			if (lista[n].size() < num) {
				throw new IllegalStateException("La lista " + n + " tiene "
						+ lista[n].size() + " ficheros pero se esperaban al menos " + num);
			}
		}

		Analyzer analizador = new SpanishAnalyzer(Version.LUCENE_31);
		IndexWriterConfig configuracionIndice = new IndexWriterConfig(
				Version.LUCENE_31, analizador);
		IndexWriter Indice = new IndexWriter(FSDirectory.open(directorioGuardarIndice), configuracionIndice);

		try {
			for (int i = 0; i < num; i++) {
				String bold = leerFichero(new File(lista[0].get(i)));
				String text = leerFichero(new File(lista[1].get(i)));
				String textIMG = leerFichero(new File(lista[2].get(i)));
				String title = leerFichero(new File(lista[3].get(i)));
				String underline = leerFichero(new File(lista[4].get(i)));
				System.out.println("--------------------bold "+bold);
				System.out.println("--------------------text"+text);
				System.out.println("-------------------textimg"+textIMG);
				System.out.println("-------------------title"+title);
				System.out.println("-------------------underline"+underline);

				Document doc = new Document();
				doc.add(new Field("bold", bold, Field.Store.YES, Field.Index.ANALYZED));
				doc.add(new Field("contenido", text, Field.Store.YES, Field.Index.ANALYZED));
				doc.add(new Field("textimg", textIMG, Field.Store.YES, Field.Index.ANALYZED));
				doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
				doc.add(new Field("underlineWords", underline, Field.Store.YES, Field.Index.ANALYZED));

				// Hard-coded control field: lets the analyzer output for a known
				// literal be compared (e.g. in Luke) with the fields read from disk.
				String title2 = "recién nacidos, lactantes y adolescentes niños de edades entre los 4 y los 8 años cabeza tórax abdomen pelvis y extremidades Prevención del riesgo de atropello Prevención de lesiones en el interior del vehículo recién nacidos lactantes niños en edad preescolar y escolar Prevención deaccidentes al usar juguetes móviles ó vehículos sin motor";
				doc.add(new Field("title2", title2, Field.Store.YES,
						Field.Index.ANALYZED));

				Indice.addDocument(doc);
			}

			Indice.optimize();
			// Query the writer while it is still open; it must not be used after close().
			System.out.println("el numero de documentos indexados es "
					+ Indice.numDocs());
		}
		finally {
			// Always release the index lock, even when indexing fails half-way.
			Indice.close();
		}
	}

}

Xerelo · #2 (**permalink**) 17/05/2013, 08:02

Prueba con el debugger