leer fichero con codificacion utf-8

Manolait · #1 (**permalink**) 20/05/2013, 11:25

Hola, tengo un problema al leer los ficheros de un directorio.

cuento mi direcctorio como es. tengo una carpeta documentos con 400 subcarpetas desde la 0 a la 400 y dentro de cada subcarpeta 5 archivos .txt siempre los mismos con los mismos nombres, por eso uso el si contiene bold pues lista[0].

asi recorro mi directorio:

Código HTML:

/*creamos una lista de ficheros
y luego para cada fichero creamos un string por cada uno
de los ficheros de un directorio*/
	private static ArrayList []lista;
	
	public static void crearLista (File f) {
		if (f.isFile() && f.getName().endsWith(".txt")) {
			if (f.getName().contains("bold"))
				lista[0].add(f.getAbsolutePath());
			else {
				if (f.getName().contains("text_ES"))
					lista[1].add(f.getAbsolutePath());
				else {
					if (f.getName().contains("text_IMG"))
						lista[2].add(f.getAbsolutePath());
					else {
						if (f.getName().contains("title"))
							lista[3].add(f.getAbsolutePath());
						else 
							lista[4].add(f.getAbsolutePath());
					}
				}
			}
			return ;
		}
		if (f.isDirectory()) {
			String[] hijos = f.list();
			for (int i=0; i<hijos.length; i++) {
				crearLista (new File (f, hijos[i]));
			}
		}
	}

y asi linea por linea leo lo que tiene.

Código HTML:

public static String leerFichero (File f) {//Éste código sirve para extraer las palabras reservadas de un archivo de texto como nuestro código fuente y guardar cada palabra en un Vector. El Vector actuará como un arreglo, pero de tamaño indefinido excepto por la cantidad de elementos que se introduzcan en él.
		StringBuffer s = new StringBuffer ();
		try {
			BufferedReader br = new BufferedReader (new FileReader(f));
			
			String cad = br.readLine();
			while (cad != null) {
				s.append(cad);
				s.append(" ");
				cad = br.readLine();
			}
			br.close();
		}
		catch (Exception e) {
		}
		return s.substring(0);
	}

necesito que al leer del directorio la carpeta 0 lo que hay dentro que son .txt lo leea con codificacion utf-8. he leido que es cambiando esta linea:

Código HTML:

BufferedReader br = new BufferedReader (new FileReader(f));

por esta.

Código HTML:

//BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream("archivo.txt"), "utf-8"));

pero claro es archivo.txt y no toda mi f.

este es mi codigo completo x si falta algo que aclarar.

Código HTML:

package uas;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.document.*;
import java.io.*;
import java.util.ArrayList;

public class CreaIndice {
	
	
	
	/*creamos una lista de ficheros
y luego para cada fichero creamos un string por cada uno
de los ficheros de un directorio*/
	private static ArrayList []lista;
	
	public static void crearLista (File f) {
		if (f.isFile() && f.getName().endsWith(".txt")) {
			if (f.getName().contains("bold"))
				lista[0].add(f.getAbsolutePath());
			else {
				if (f.getName().contains("text_ES"))
					lista[1].add(f.getAbsolutePath());
				else {
					if (f.getName().contains("text_IMG"))
						lista[2].add(f.getAbsolutePath());
					else {
						if (f.getName().contains("title"))
							lista[3].add(f.getAbsolutePath());
						else 
							lista[4].add(f.getAbsolutePath());
					}
				}
			}
			return ;
		}
		if (f.isDirectory()) {
			String[] hijos = f.list();
			for (int i=0; i<hijos.length; i++) {
				crearLista (new File (f, hijos[i]));
			}
		}
	}
	
	public static String leerFichero (File f) {//Éste código sirve para extraer las palabras reservadas de un archivo de texto como nuestro código fuente y guardar cada palabra en un Vector. El Vector actuará como un arreglo, pero de tamaño indefinido excepto por la cantidad de elementos que se introduzcan en él.
		StringBuffer s = new StringBuffer ();
		try {
			BufferedReader br = new BufferedReader (new FileReader(f));
			//BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream("c:\\Temp\\documentos\\0\\title_ES.txt"), "utf-8"));
			String cad = br.readLine();
			while (cad != null) {
				s.append(cad);
				s.append(" ");
				cad = br.readLine();
			}
			br.close();
		}
		catch (Exception e) {
		}
		return s.substring(0);
	}

	public static void main(String[] args) throws Exception {
		
		lista  = new ArrayList[5];
		for (int i=0; i<5; i++)
			lista[i] = new ArrayList();
		
		File directorioGuardarIndice = new File("c:\\Temp\\indice");//crear carpeta indice
		
		File Documentos = new File("c:\\Temp\\documentos");//crear carpeta con los documentos a indexar
		
		
		crearLista (Documentos);
		/*for (int n=0; n<5; n++) {
			System.out.println("Lista: "+(n+1));
			for (int i=0; i<lista[n].size(); i++)
				System.out.println(lista[n].get(i));
		}*/
		
		
		Analyzer analizador = new SpanishAnalyzer(Version.LUCENE_31);//analizador
		
		//Analyzer analizador = new StandardAnalyzer(Version.LUCENE_31);
		

		IndexWriterConfig configuracionIndice = new IndexWriterConfig(
				Version.LUCENE_31, analizador);
		

		IndexWriter Indice = new IndexWriter(FSDirectory.open(directorioGuardarIndice), configuracionIndice);
		
		//IndexWriter Indice = new IndexWriter(RecorreDirectorio, analizador, true, IndexWriter.MaxFieldLength.LIMITED);
		
		String bold;
        String text;
        String textIMG;
        String title;
        String underline;
        
        String id;
        
        int num = lista[0].size(); //directorios
        System.out.println("Directorios buscados: "+num);
        for (int i=0; i<num; i++) {
        		id = num+"";
        		bold = leerFichero(new File ((String)lista[0].get(i)));
        		text = leerFichero(new File ((String)lista[1].get(i)));
        		textIMG = leerFichero(new File ((String)lista[2].get(i)));
        		title = leerFichero(new File ((String)lista[3].get(i)));
        		underline = leerFichero(new File ((String)lista[4].get(i)));
        		System.out.println("--------------------bold "+bold);
        		System.out.println("--------------------text"+text);
        		System.out.println("-------------------textimg"+textIMG);
        		System.out.println("-------------------title"+title);
        		System.out.println("-------------------underline"+underline);
        		Document doc = new Document();
        		
        		// Campo bold
				Field campobold = new Field("bold", bold,Field.Store.YES, Field.Index.ANALYZED);
				doc.add(campobold);				
				// Campo text
        		Field campocontenido = new Field("contenido", text,Field.Store.YES, Field.Index.ANALYZED);
				doc.add(campocontenido);
				// Campo textimg
				Field campotextimg = new Field("textimg", textIMG,Field.Store.YES, Field.Index.ANALYZED);
				doc.add(campotextimg);
				// Campo title
				Field campotitle = new Field("title", title,Field.Store.YES, Field.Index.ANALYZED);
				doc.add(campotitle);
				// Campo underlineWords
				Field campounderlineWords = new Field("underlineWords", underline,Field.Store.YES, Field.Index.ANALYZED);
				doc.add(campounderlineWords);
				
				
				        		
        		Indice.addDocument(doc);
        		
        }
	
        
        /********************************************************************/
              
				
				/*
				 Campo nombre archivo
				Field camponombre = new Field("rutaArchivo", new InputStreamReader(new FileInputStream(f), "UTF-8"));
				
				*/
					
		Indice.optimize();
		Indice.close();
		System.out.println("el numero de documentos indexados es "
				+ Indice.numDocs());
	}

	
}

estoy atascado y no se seguir ayuda

Xerelo · #2 (**permalink**) 20/05/2013, 12:24

¿Y cuál es es problema?

f es un File y de ahí puedes sacar el nombre del fichero, el directorio padre, su ruta absoluta o lo que quieras.

Manolait · #3 (**permalink**) 20/05/2013, 12:45

ya esta arreglado el problema eran mis .txt k son copiados del dropbox y no se xk mete una ? x la cara al empezar. me lo creado otra vez y bien:)