您的位置:首页 > 博客中心 > 编程语言 >

java根据标点英文分词

时间:2022-03-21 06:22

最近学习 Java 字符串部分,用正则表达式做了一个简单的统计单词出现次数的小程序,目前只能统计英文。

整个程序包括三个包,分别为 output、run、wordCount。

wordCount包

执行单词统计逻辑的工具包,使用 HashMap 存储某个字符串出现的次数。

- setPattern(源代码中拼写为 setPatterm)用来在类外部设置不同的正则表达式,从而使用不同的分词规则(策略模式的一个变种吧),默认使用 [a-zA-Z]+
- getMap 返回存储统计结果的 map,map 用来给输出器(OutputProcesser)将结果输出到控制台或文件。

源代码:

    package wordCount;

    import java.util.*;
    import java.util.Map.*;
    import java.util.regex.*;

    public class WordCount{
        private String pattern = new String("([a-zA-Z]+)");
        private HashMap<String, Integer> map;

        public void setPatterm(String p){
            this.pattern = p;
        }

        public Map<String, Integer> getMap(){
            return this.map;
        }

        public void count(String str){
            this.map = new HashMap<String, Integer>();
            Matcher matcher = Pattern.compile(this.pattern).matcher(str);

            String key;
            while ( matcher.find() ){
                key = matcher.group();
                if ( this.map.containsKey(key) ){
                    this.map.put(key, this.map.get(key) + 1);
                }else {
                    this.map.put(key, 1);
                }
            }
        }
    }

output包

这个包包括了三个类:OutputProcesser、ConsoleOutput、FileOutput,其中 OutputProcesser 作为基类。

OutputProcesser.java

- 构造器接收一个来自 wordCount 返回的 map
- processInternal() 声明为抽象方法,不同的子类实现不同输出方式
- output() 作为外部调用的接口,接口会循环 map,给 processInternal 提供 entry,processInternal 会根据提供的 entry 进行自己的输出
- beforeOutput() 在输出循环开始前调用
- afterOutput() 在输出循环结束后调用

beforeOutput() 和 afterOutput() 这两个方法使用了模板方法设计模式,用于子类进行输出准备和结束操作,如输出到文件时,在循环开始前打开文件,循环结束之后关闭文件。

    package output;

    import java.util.Iterator;
    import java.util.Map;
    import java.util.Map.Entry;

    public abstract class OutputProcesser {
        private Map<String, Integer> map;

        public OutputProcesser(Map<String, Integer> map){
            this.map = map;
        }

        public void output(){
            if ( this.beforeOutput(this.map) ){
                Iterator<Entry<String, Integer>> iterator = this.map.entrySet().iterator();

                while ( iterator.hasNext() ){
                    this.processInternal(iterator.next());
                }
                this.afterOutput(this.map);
            }
        }

        protected boolean beforeOutput(Map<String, Integer> map){
            return true;
        }

        protected void afterOutput(Map<String, Integer> map){
        }

        abstract protected void processInternal(Entry<String, Integer> entry);
    }

ConsoleOutput.java

    package output;

    import java.util.Map.Entry;
    import java.util.*;

    public class ConsoleOutput extends OutputProcesser{
        public ConsoleOutput(Map<String, Integer> map) {
            super(map);
        }

        protected void processInternal(Entry<String, Integer> entry){
            System.out.println(this.logString(entry));
        }

        protected String logString(Entry<String, Integer> entry){
            return entry.getKey()+" : "+entry.getValue()+" times";
        }
    }

FileOutput.java

    package output;

    import java.io.FileWriter;
    import java.io.IOException;
    import java.io.PrintWriter;
    import java.util.Map;
    import java.util.Map.Entry;

    public class FileOutput extends ConsoleOutput{
        private String filePath = "result.txt";
        private FileWriter fw;
        private PrintWriter pw;

        public FileOutput(Map<String, Integer> map) {
            super(map);
        }

        protected boolean beforeOutput(Map<String, Integer> map){
            try {
                this.fw = new FileWriter(this.filePath);
                this.pw = new PrintWriter(this.fw);
            } catch (IOException e) {
                System.out.println("IOException before process output");
            }
            return true;
        }

        protected void afterOutput(Map<String, Integer> map){
            try {
                this.pw.close();
                this.fw.close();
            } catch (IOException e) {
                System.out.println("IOException after process output");
            }
        }

        protected void processInternal(Entry<String, Integer> entry){
            this.pw.println(this.logString(entry));
        }
    }

run包

该包对 wordCount 进行测试,并调用输出类,同时进行简单的性能测试(使用内存和耗时)。首先会打开一个文本,并读入内存,将文本交给 wordCount 进行处理。

    package run;

    import java.io.File;
    import java.io.FileInputStream;
    import java.util.logging.ConsoleHandler;
    import org.omg.SendingContext.RunTime;

    import wordCount.WordCount;
    import output.*;

    public class Run {
        private long totalMemory = 0,time1 = 0,time2 = 0,memoryUsage = 0;

        public void beginProfile(){
            this.totalMemory = Runtime.getRuntime().totalMemory();
            this.time1 = this.time2 = System.currentTimeMillis();
        }

        public void endProfile(){
            this.memoryUsage = this.totalMemory - Runtime.getRuntime().freeMemory();
            this.time2 = System.currentTimeMillis();
            System.out.println("memory usage:"+this.memoryUsage+" B");
            System.out.println("time usage:"+(this.time2 - this.time1)+"ms");
        }

        public String readFromFile(String filePath){
            File file = new File(filePath);
            Long fLength = file.length();
            byte[] content = new byte[fLength.intValue()];
            try {
                FileInputStream input = new FileInputStream(file);
                input.read(content);
                input.close();
            } catch (Exception e) {
            }

            return new String(content);
        }

        public static void main(String[] args) {
            Run run = new Run();
            WordCount wordCount = new WordCount();
            run.beginProfile();
            wordCount.count(run.readFromFile("messages.txt"));
            run.endProfile();
            OutputProcesser out = new FileOutput(wordCount.getMap());
            out.output();
        }

    }

下面是简单的性能测试结果

gxlsystem.com,gxl网

java根据标点英文分词,gxlsystem

本类排行

今日推荐

热门手游