`
orange.lpai
  • 浏览: 89103 次
  • 性别: Icon_minigender_1
  • 来自: 上海
社区版块
存档分类
最新评论

互联网时代的社会语言学:基于SNS的文本数据挖掘

阅读更多
互联网时代的社会语言学:基于SNS的文本数据挖掘
本文转载于http://www.matrix67.com/blog/archives/5044

几个概念


凝固度

我们定义“电影院”的凝合程度就是 p(电影院) 与 p(电) · p(影院) 比值和 p(电影院) 与 p(电影) · p(院) 的比值中的较小值,“的电影”的凝合程度则是 p(的电影) 分别除以 p(的) · p(电影) 和 p(的电) · p(影) 所得的商的较小值。

自由度
我们不妨就把一个文本片段的自由运用程度定义为它的左邻字信息熵和右邻字信息熵中的较小值。

java实现,100M文本效果还可以,但大于100M以后内存会溢出


public class FindWordsByWordArray {  
  
    private final static ResourceBundle resourceBundle = ResourceBundle.getBundle("finder");  
  
    private Map<String, Word> wordsMap = new HashMap<String, Word>();  
  
    private int wordMaxLen = 5;  
    private double allTextLen = 0;  
    private double allDomSize = 0;  
    private double mutualInformationPunish = 0.5;  
    private double leftAndRightEntropyPunish = 1;  
    private double wholePunish = 10;  
  
    public FindWordsByWordArray() {  
    }  
  
    public FindWordsByWordArray(long num) {  
        this.allDomSize = num;  
    }  
  
    public static long pretreatment(File input, File output) throws IOException {  
  
        if(output.exists())  
            FileUtils.deleteQuietly(output);  
  
        LineIterator list = FileUtils.lineIterator(input, "utf-8");  
  
        List<String> res = new ArrayList<String>();  
  
        long num = 0;  
        for (String text = list.next(); list.hasNext(); text = list.next()) {  
              
            num++;  
              
            res.addAll(pretreatment(text));  
  
            if(res.size() > 500000) {  
                FileUtils.writeLines(output, res, true);  
                System.out.println("write lines 500000.");  
                res.clear();  
            }  
  
        }  
        list.close();  
  
        if(res.size() > 0)  
            FileUtils.writeLines(output, res, true);  
  
        System.out.println("pretreatment over.");  
        return num;  
    }  
  
    private static List<String> pretreatment(String... texts) {  
        List<String> res = new ArrayList<String>();  
          
        for(String text: texts)  {  
            text = text.toLowerCase().replaceAll("\\d", "N")  
                    .replaceAll("(\\p{P}|\\s+|&[a-zA-Z]*;|[a-zA-z]+://[^\\s]*|~|~|★)", "#")  
                    .replace('.', '#')  
                    .replace('+', '#')  
                    .replace('|', '#')  
                    .replace('>', '#');  
  
            for (String some : text.split("#")) {  
                if (some.length() < 5)  
                    continue;  
                res.add(some);  
            }  
        }  
        return res;  
    }  
  
    public void parse(boolean needPretreatment, String... texts) {  
        if(needPretreatment) {  
            allDomSize += texts.length;  
            parse(false, pretreatment(texts));  
            return;  
        }  
        for (String text : texts) {  
            if (text.matches("^[a-zA-Z]*")) {  
                parseEnglish(text);  
                allTextLen += 1;  
            }else {  
                parseChinese(text);  
                allTextLen += text.length();  
            }  
        }  
    }  
  
    private void parseEnglish(String text) {  
        addEnglishWord(text);  
    }  
  
    private void parseChinese(String text) {  
  
        WordArray wordArray = new WordArray(text);  
        String left = null;  
        int thisWordMaxLen = wordMaxLen;  
  
        for (int index = 0, textLen = wordArray.wordLen(); index < textLen - 1; index++) {  
            for (int i = 2; i <= thisWordMaxLen; i++) {  
                int toIndex = index + i;  
                if (toIndex > textLen)  
                    break;  
                String word = wordArray.subWords(index, toIndex);  
                addWord(word);  
                if (left != null)  
                    wordsMap.get(word).leftAdd(left);  
                if (toIndex + 1 <= textLen)  
                    wordsMap.get(word).rightAdd(wordArray.subWords(toIndex, toIndex + 1));  
            }  
            left = wordArray.subWords(index, index + 1);  
        }  
  
        for (String s : wordArray.getChineseWords()) {  
           addWord(s);  
        }  
        for (String s : wordArray.getEnglishWords()) {  
            addEnglishWord(s);  
        }  
    }  
  
    private void addWord(String word) {  
        if (word.length() == 0)  
            throw new IllegalArgumentException("word length is 0.");  
        if (wordsMap.containsKey(word))  
            wordsMap.get(word).getTf().incrementAndGet();  
        else  
            wordsMap.put(word, new Word(word));  
    }  
  
    private void addEnglishWord(String word) {  
        addWord(word);  
        wordsMap.get(word).setAllEnglish(true);  
    }  
  
    public void parse(boolean needPretreatment, Collection<String> texts) {  
        parse(needPretreatment, texts.toArray(new String[texts.size()]));  
    }  
  
    public List<String> print() {  
        return print(getRes());  
    }  
  
    public List<String> print(List<Word> words) {  
        List<String> res = new ArrayList<String>();  
        for (Word word : words) {  
            res.add(word.toTab());  
        }  
        return res;  
    }  
  
    public List<Word> getRes() {  
        List<Word> words = new ArrayList<Word>(wordsMap.values());  
        words = Lists.newArrayList(Collections2.filter(words, new Predicate<Word>() {  
            @Override  
            public boolean apply(Word word) {  
                return word.getConfidenceLevel() > 1;  
            }  
        }));  
        Collections.sort(words, new Comparator<Word>() {  
            @Override  
            public int compare(Word word1, Word word2) {  
                return word2.tf.get() - word1.tf.get();  
            }  
        });  
        return words;  
    }  
  
    class Word {  
  
        private String word;  
        private AtomicInteger tf;  
        private StringBuilder left;  
        private StringBuilder right;  
        private Double level = null;  
        private boolean isAllEnglish = false;  
  
        Word(String word) {  
            this.word = word;  
            this.tf = new AtomicInteger(1);  
        }  
  
        public String getWord() {  
            return word;  
        }  
  
        public AtomicInteger getTf() {  
            return tf;  
        }  
  
        public void leftAdd(String str) {  
            if(left == null)  
                this.left = new StringBuilder(3);  
            if(this.left.indexOf(str) < 0)  
                this.left.append(str);  
        }  
  
        public int getLeftNum() {  
            if(left == null)  
                return 0;  
            return new WordArray(left.toString()).wordLen();  
        }  
  
        public void rightAdd(String str) {  
            if(right == null)  
                this.right = new StringBuilder(3);  
            if(this.right.indexOf(str) < 0)  
                this.right.append(str);  
        }  
  
        public int getRightNum() {  
            if(right == null)  
                return 0;  
            return new WordArray(right.toString()).wordLen();  
        }  
  
        public void setAllEnglish(boolean allEnglish) {  
            isAllEnglish = allEnglish;  
        }  
  
        private Double getConfidenceLevel() {  
            if (this.level != null)  
                return this.level;  
  
            double allDomSize = FindWordsByWordArray.this.allDomSize;  
  
            if (this.getWord().replaceAll("N","").length() <= 1)  
                return 0d;  
            if (this.getTf().get() < allDomSize / 90)  
                return 0d;  
            double value;  
            if (!this.isAllEnglish) {  
  
                if (this.getLeftNum() < allDomSize / 190)  
                    return 0d;  
                if (this.getRightNum() < allDomSize / 190)  
                    return 0d;  
                if ((this.getRightNum() + this.getLeftNum()) < allDomSize / 90)  
                    return 0d;  
                value = Double.MAX_VALUE;  
  
                WordArray wordArray = new WordArray(this.getWord());  
  
                for (int i = 1; i < wordArray.wordLen(); i++) {  
  
                    int leftTf = wordsMap.get(wordArray.subWords(0, i)).getTf().get();  
  
                    int rightTf = wordsMap.get(wordArray.subWords(i)).getTf().get();  
  
                    double normal = leftTf * rightTf / (allTextLen * allTextLen);  
  
                    double reality = this.getTf().get() * 2 / allTextLen;  
  
                    value = reality / normal < value ? reality / normal : value;  
                }  
  
                int size = this.getLeftNum() > this.getRightNum() ?  
                        this.getRightNum() : this.getLeftNum();  
  
                value = Math.pow(value, mutualInformationPunish) *  
                        Math.pow(size, leftAndRightEntropyPunish)  
                        / wholePunish;  
            } else {  
                value = this.getTf().get() * 15 / allDomSize;  
            }  
            this.level = value;  
            return value;  
        }  
  
        @Override  
        public String toString() {  
            return "Word{" +  
                    "word='" + word + '\'' +  
                    ", tf=" + tf +  
                    ", left=" + cutOff(left.toString(), 15) +  
                    ", right=" + cutOff(right.toString(), 15) +  
                    '}';  
        }  
  
        public String toTab() {  
            return word + '\t' +  
                    tf + '\t' +  
                    level + '\t' +  
                    getLeftNum() + '\t' +  
                    getRightNum();  
        }  
  
        private String cutOff(String str, int max) {  
              if (str.length() > max)  
                str = str.substring(0, max) + "...]";  
            return "(" + new WordArray(str).wordLen() + ")" + str;  
        }  
    }  
  
    public void setWordMaxLen(int wordMaxLen) {  
        this.wordMaxLen = wordMaxLen;  
    }  
  
    public void setMutualInformationPunish(double mutualInformationPunish) {  
        this.mutualInformationPunish = mutualInformationPunish;  
    }  
  
    public void setLeftAndRightEntropyPunish(double leftAndRightEntropyPunish) {  
        this.leftAndRightEntropyPunish = leftAndRightEntropyPunish;  
    }  
  
    public void setWholePunish(double wholePunish) {  
        this.wholePunish = wholePunish;  
    }  
  
    public static void main(String[] args) throws IOException {  
  
        String inputPath = "e:/xiaoshuo.txt";  
        String outputPath = "e:/xiaoshuo_words";  
  
//        String inputPath = "e:/tweet/parse";  
//        String outputPath = "e:/tweet/words";  
  
        File inputFile = new File(inputPath);  
  
        if (inputFile.isFile()) {  
            File pretreatFile = new File("e:/xiaoshuo_p");  
            long domSize = pretreatment(new File(inputPath), pretreatFile);  
            System.out.println(domSize);  
            FindWordsByWordArray findWords = getFindWords(domSize);  
            LineIterator list = FileUtils.lineIterator(pretreatFile, "utf-8");  
            int i = 0;  
            for(String str = list.next(); list.hasNext(); str = list.next()){  
                findWords.parse(false, str);  
                if(i++ % 500000 == 0)  
                    System.out.print(".");  
            }                     
            list.close();  
            FileUtils.writeLines(new File(outputPath), findWords.print());  
        } else {  
            for (String inputFileName : inputFile.list()) {  
                FindWordsByWordArray findWords = getFindWords();  
                List<String> list = FileUtils.readLines(new File(inputPath, inputFileName), "utf-8");  
                findWords.parse(true, Lists.transform(list, new Function<String, String>() {  
                    @Override  
                    public String apply(String s) {  
                        return s.substring(s.split("\t")[0].length());  
                    }  
                }));  
                String outputFileName = inputFileName + "-words.";  
                if (inputFileName.split("\\.").length == 2)  
                    outputFileName = inputFileName.split("\\.")[0] + "-words." +  
                            inputFileName.split("\\.")[1];  
                List<String> printList = findWords.print();  
                if (printList.size() > 500)  
                    printList = printList.subList(0, 500);  
                FileUtils.writeLines(new File(outputPath, outputFileName), printList);  
            }  
        }  
    }  
  
    private static FindWordsByWordArray getFindWords() {  
        return getFindWords(0);  
    }  
  
    private static FindWordsByWordArray getFindWords(long num) {  
        FindWordsByWordArray findWords = new FindWordsByWordArray(num);  
        findWords.setWordMaxLen(Integer.parseInt(resourceBundle.getString("word.max.len")));  
        findWords.setMutualInformationPunish(  
                Double.parseDouble(resourceBundle.getString("mutual.information.punish")));  
        findWords.setLeftAndRightEntropyPunish(  
                Double.parseDouble(resourceBundle.getString("left.and.right.entropy.punish")));  
        findWords.setWholePunish(Double.parseDouble(resourceBundle.getString("whole.punish")));  
        return findWords;  
    }  


public class WordArray {  
  
    private String someWord;  
    private List<int[]> enIndexAndLen = null;  
  
    public WordArray(String someWord) {  
        this.someWord = someWord;  
        char[] chars = someWord.toCharArray();  
        for(int i = 0, charsLen = chars.length; i<charsLen; i++) {  
            if(CharUtils.isEnglish(chars[i])) {  
                int index = i;  
                while (++i < charsLen && CharUtils.isEnglish(chars[i]));  
                if(enIndexAndLen == null)  
                    enIndexAndLen = new ArrayList<int[]>();  
                enIndexAndLen.add(new int[]{index, i - index});  
            }  
        }  
    }  
      
    public String subWords(int beginIndex, int endIndex) {  
        int realityBeginIndex = beginIndex;  
        int realityEndIndex = endIndex;  
        if(enIndexAndLen != null) {  
            for(int[] intArray: enIndexAndLen) {  
                if(intArray[0] < realityBeginIndex) {  
                    realityBeginIndex += intArray[1] -1;  
                }  
                if(intArray[0] < realityEndIndex) {  
                    realityEndIndex += intArray[1] - 1;  
                }  
            }  
        }  
        return someWord.substring(realityBeginIndex, realityEndIndex);  
    }  
  
    public String subWords(int beginIndex) {  
        return subWords(beginIndex, wordLen());  
    }  
  
    public int wordLen() {  
        int len = someWord.length();  
        if(enIndexAndLen != null)  
            for(int[] intArray: enIndexAndLen)  
                len -= (intArray[1] - 1);  
        return len;  
    }  
  
    public String[] getEnglishWords() {  
        if(enIndexAndLen != null) {  
            String[] strings = new String[enIndexAndLen.size()];  
            int i = 0;  
            for(int[] intArray: enIndexAndLen)  
                strings[i++] = someWord.substring(intArray[0], intArray[0]+intArray[1]);  
            return strings;  
        }else{  
            return new String[0];  
        }  
    }  
  
    public List<String> getChineseWords() {  
        List<String> strings = new ArrayList<String>();  
        for (char c : someWord.toCharArray()) {  
            if(CharUtils.isEnglish(c))  
                continue;  
            strings.add(String.valueOf(c));  
        }  
        return strings;  
    }  
  
    public static void main(String[] args) {  
        WordArray wordArray = new WordArray("我爱Style江南的music哈");  
        System.out.println(wordArray.subWords(0, 5));  
        System.out.println(wordArray.subWords(5, wordArray.wordLen()));  
        System.out.println(wordArray.wordLen());  
        System.out.println(Arrays.toString(wordArray.getEnglishWords()));  
    }  
  
}  



#(int)[2-n default=10] word max len.  
word.max.len = 5  
  
#(double)[1.0-0.0 default=0.3] MutualInformation punish.  
mutual.information.punish = 0.5  
  
#(double)[1.0-0.0 default=1.0] LeftAndRightEntropy punish.  
left.and.right.entropy.punish = 1  
  
#(double)[1-n default=10] WholePunish punish.  
whole.punish = 10  
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics