文本处理与分析 | 用KMP鸿蒙分析文本

yuexixi2

1085人浏览 · 2025-12-02 22:53:13

yuexixi2 · 2025-12-02 22:53:13 发布

项目概述

文本分析是现代应用开发中的基础需求。无论是在自然语言处理、内容分析、情感识别还是文本挖掘中，都需要进行各种文本分析操作。然而，不同的编程语言和平台对文本分析的实现方式各不相同，这导致开发者需要在不同平台上重复编写类似的逻辑。

本文介绍一个基于 Kotlin Multiplatform (KMP) 和 OpenHarmony 平台的文本分析工具库。这个工具库提供了一套完整的文本分析能力，包括词频分析、情感分析、文本相似度、关键词提取等功能。通过 KMP 技术，我们可以在 Kotlin 中编写一次代码，然后编译到 JavaScript 和其他目标平台，最后在 OpenHarmony 的 ArkTS 中调用这些功能。

技术架构

多平台支持

Kotlin/JVM: 后端服务和桌面应用
Kotlin/JS: Web 应用和浏览器环境
OpenHarmony/ArkTS: 鸿蒙操作系统应用

核心功能模块

词频分析: 统计文本中的词频
情感分析: 分析文本的情感倾向
文本相似度: 计算两个文本的相似度
关键词提取: 提取文本中的关键词
文本统计: 统计文本的各种指标
句子分割: 将文本分割成句子
语言检测: 检测文本的主要语言（中文/英文）
可读性指数: 结合每词字符数与每句词数计算可读性
文本摘要: 提取前若干句作为摘要

Kotlin 实现

核心文本分析类

// 文件: src/commonMain/kotlin/TextAnalyzer.kt

/**
 * Text-analysis helpers: word frequency, sentiment, similarity, keyword
 * extraction, statistics, sentence splitting, language detection,
 * readability and summaries.
 *
 * Word-level normalisation ([analyzeWordFrequency], [extractKeywords]) keeps
 * only ASCII letters, digits and whitespace, so those functions are only
 * meaningful for English-like input.
 */
class TextAnalyzer {

    /**
     * Counts how often each word occurs in [text].
     *
     * The text is lower-cased and every character that is not `a-z`, `0-9`
     * or whitespace is removed before splitting on whitespace.
     *
     * @param text text to analyse
     * @return map from word to occurrence count, in first-occurrence order
     */
    fun analyzeWordFrequency(text: String): Map<String, Int> =
        normalizedWords(text).groupingBy { it }.eachCount()

    /**
     * Returns the most frequent words of [text].
     *
     * @param text text to analyse
     * @param topN maximum number of entries to return
     * @return `(word, count)` pairs sorted by descending count; ties keep
     *         first-occurrence order because the sort is stable
     */
    fun getTopWords(text: String, topN: Int = 10): List<Pair<String, Int>> =
        analyzeWordFrequency(text)
            .toList()
            // sortedByDescending returns a new list; sortByDescending only
            // exists on MutableList and returns Unit.
            .sortedByDescending { it.second }
            .take(topN)

    /**
     * Scores the sentiment of [text] against two small English word lists.
     *
     * Each whitespace-separated token is stripped of leading/trailing
     * non-alphanumeric characters before lookup, so "great!" counts as
     * "great". The result is `(positive hits - negative hits) / token count`.
     *
     * @param text text to analyse
     * @return score in the range -1.0..1.0; 0.0 when [text] has no tokens
     */
    fun analyzeSentiment(text: String): Double {
        val tokens = whitespaceTokens(text)
        if (tokens.isEmpty()) return 0.0

        val words = tokens.map { token -> token.trim { !it.isLetterOrDigit() } }
        val positiveHits = words.count { it in POSITIVE_WORDS }
        val negativeHits = words.count { it in NEGATIVE_WORDS }

        return (positiveHits - negativeHits).toDouble() / tokens.size
    }

    /**
     * Computes the Jaccard similarity of the lower-cased, whitespace-separated
     * token sets of the two texts.
     *
     * @param text1 first text
     * @param text2 second text
     * @return `|intersection| / |union|` in 0.0..1.0; 0.0 when neither text
     *         contains a token
     */
    fun calculateSimilarity(text1: String, text2: String): Double {
        val first = whitespaceTokens(text1).toSet()
        val second = whitespaceTokens(text2).toSet()

        val unionSize = (first union second).size
        if (unionSize == 0) return 0.0

        return (first intersect second).size.toDouble() / unionSize
    }

    /**
     * Extracts the most frequent non-stop-words of [text].
     *
     * Uses the same normalisation as [analyzeWordFrequency], then drops stop
     * words and words of two characters or fewer.
     *
     * @param text text to analyse
     * @param topN maximum number of keywords to return
     * @return keywords sorted by descending frequency
     */
    fun extractKeywords(text: String, topN: Int = 5): List<String> =
        normalizedWords(text)
            .filter { it !in STOP_WORDS && it.length > 2 }
            .groupingBy { it }
            .eachCount()
            .toList()
            .sortedByDescending { it.second }
            .take(topN)
            .map { it.first }

    /**
     * Collects basic statistics about [text].
     *
     * Words are whitespace-separated tokens (punctuation stays attached);
     * sentences are the non-blank fragments between `.`, `!` and `?`.
     *
     * @param text text to analyse
     * @return map with the keys `characterCount`, `wordCount`,
     *         `sentenceCount`, `uniqueWords` (all `Int`) and
     *         `averageWordLength`, `averageWordsPerSentence`,
     *         `lexicalDiversity` (all `Double`, 0.0 when undefined)
     */
    fun getTextStatistics(text: String): Map<String, Any> {
        val words = rawWords(text)
        val sentenceCount = sentenceFragments(text).size
        val uniqueWords = words.toSet().size
        // Average over the words themselves, not the whole text, so that
        // whitespace between words does not inflate the value.
        val averageWordLength =
            if (words.isEmpty()) 0.0 else words.sumOf { it.length }.toDouble() / words.size

        return mapOf(
            "characterCount" to text.length,
            "wordCount" to words.size,
            "sentenceCount" to sentenceCount,
            "uniqueWords" to uniqueWords,
            "averageWordLength" to averageWordLength,
            "averageWordsPerSentence" to
                if (sentenceCount > 0) words.size.toDouble() / sentenceCount else 0.0,
            "lexicalDiversity" to
                if (words.isEmpty()) 0.0 else uniqueWords.toDouble() / words.size,
        )
    }

    /**
     * Splits [text] into sentences at `.`, `!` and `?`.
     *
     * @param text text to split
     * @return trimmed, non-empty sentences without their terminators
     */
    fun splitSentences(text: String): List<String> =
        sentenceFragments(text).map { it.trim() }

    /**
     * Guesses whether [text] is mainly Chinese or English by comparing the
     * number of characters in U+4E00..U+9FA5 with the number of ASCII letters.
     *
     * @param text text to inspect
     * @return `"zh"`, `"en"`, or `"unknown"` when the counts are equal
     */
    fun detectLanguage(text: String): String {
        val chineseCount = text.count { it in '\u4e00'..'\u9fa5' }
        val englishCount = text.count { it in 'a'..'z' || it in 'A'..'Z' }

        return when {
            chineseCount > englishCount -> "zh"
            englishCount > chineseCount -> "en"
            else -> "unknown"
        }
    }

    /**
     * Computes a simple readability index:
     * `characters / words + words / sentences`.
     *
     * @param text text to analyse
     * @return the index, or 0.0 when [text] has no words or no sentences
     */
    fun calculateReadability(text: String): Double {
        val wordCount = rawWords(text).size
        val sentenceCount = sentenceFragments(text).size
        if (wordCount == 0 || sentenceCount == 0) return 0.0

        return text.length.toDouble() / wordCount + wordCount.toDouble() / sentenceCount
    }

    /**
     * Builds a summary from the first sentences of [text].
     *
     * @param text text to summarise
     * @param sentenceCount maximum number of sentences to keep
     * @return the selected sentences joined with `". "` and terminated with
     *         `"."`, or an empty string when there is nothing to summarise
     */
    fun generateSummary(text: String, sentenceCount: Int = 3): String {
        val selected = splitSentences(text).take(sentenceCount)
        return if (selected.isEmpty()) "" else selected.joinToString(". ") + "."
    }

    /** Lower-cases [text], strips everything except `a-z0-9` and whitespace, and splits into words. */
    private fun normalizedWords(text: String): List<String> =
        text.lowercase()
            .replace(NON_WORD_CHARS, "")
            .split(WHITESPACE)
            .filter { it.isNotEmpty() }

    /** Lower-cases [text] and splits it on whitespace, dropping the empty tokens `split` yields at the edges. */
    private fun whitespaceTokens(text: String): List<String> =
        text.lowercase().split(WHITESPACE).filter { it.isNotEmpty() }

    /** Splits [text] on whitespace without changing case, dropping empty tokens. */
    private fun rawWords(text: String): List<String> =
        text.split(WHITESPACE).filter { it.isNotEmpty() }

    /** Returns the non-blank fragments of [text] between sentence terminators. */
    private fun sentenceFragments(text: String): List<String> =
        text.split(SENTENCE_END).filter { it.isNotBlank() }

    private companion object {
        // Compiled once instead of on every call.
        val WHITESPACE = Regex("\\s+")
        val NON_WORD_CHARS = Regex("[^a-z0-9\\s]")
        val SENTENCE_END = Regex("[.!?]")

        val POSITIVE_WORDS = setOf("good", "great", "excellent", "amazing", "wonderful", "perfect", "love", "best")
        val NEGATIVE_WORDS = setOf("bad", "terrible", "awful", "horrible", "worst", "hate", "poor", "ugly")
        val STOP_WORDS = setOf("the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of", "is", "are")
    }
}

Kotlin 实现的核心特点

Kotlin 实现中的文本分析功能充分利用了 Kotlin 标准库的字符串处理和集合操作能力。词频分析使用了 groupingBy 和 eachCount 方法。情感分析使用了预定义的词汇集合。

文本相似度计算使用了集合的交集和并集操作。关键词提取使用了停用词过滤。文本统计使用了正则表达式分割和集合操作。

语言检测使用了 Unicode 范围检查。可读性指数计算使用了文本统计指标的组合。

JavaScript 实现

编译后的 JavaScript 代码

// 文件: build/js/packages/kmp_openharmony-js/kotlin/kmp_openharmony.js
// (由 Kotlin 编译器自动生成)

/**
 * JavaScript counterpart of the Kotlin TextAnalyzer class; it exposes the
 * same method names and is meant to behave the same way.
 * NOTE(review): the surrounding article presents this file as Kotlin/JS
 * compiler output — keep it in sync with the Kotlin source when editing.
 */
class TextAnalyzer {
  /**
   * Counts how often each word occurs in the text. The text is lower-cased
   * and every character that is not a-z, 0-9 or whitespace is removed first.
   * @param {string} text - Text to analyse
   * @returns {Object} Plain object mapping word to occurrence count
   */
  analyzeWordFrequency(text) {
    const frequency = {};
    // Counting happens in a Map so that words such as "constructor" do not
    // pick up inherited Object.prototype members; plain assignment then
    // creates an own property that shadows the inherited one.
    for (const [word, count] of this._countWords(this._normalizedWords(text))) {
      frequency[word] = count;
    }
    return frequency;
  }

  /**
   * Returns the most frequent words of the text.
   * @param {string} text - Text to analyse
   * @param {number} topN - Maximum number of entries to return
   * @returns {Array} [word, count] pairs sorted by descending count; ties
   *   keep first-occurrence order
   */
  getTopWords(text, topN = 10) {
    return [...this._countWords(this._normalizedWords(text))]
      .sort((a, b) => b[1] - a[1])
      .slice(0, topN);
  }

  /**
   * Scores the sentiment of the text against two small English word lists.
   * Each whitespace-separated token is stripped of leading/trailing
   * non-alphanumeric characters before lookup, so "great!" counts as "great".
   * @param {string} text - Text to analyse
   * @returns {number} (positive hits - negative hits) / token count, in
   *   -1..1; 0 when the text has no tokens
   */
  analyzeSentiment(text) {
    const positiveWords = new Set(['good', 'great', 'excellent', 'amazing', 'wonderful', 'perfect', 'love', 'best']);
    const negativeWords = new Set(['bad', 'terrible', 'awful', 'horrible', 'worst', 'hate', 'poor', 'ugly']);

    const tokens = this._whitespaceTokens(text);
    if (tokens.length === 0) return 0.0;

    let score = 0;
    for (const token of tokens) {
      // Trim anything that is not a letter or decimal digit from both ends.
      const word = token.replace(/^[^\p{L}\p{Nd}]+|[^\p{L}\p{Nd}]+$/gu, '');
      if (positiveWords.has(word)) score += 1.0;
      else if (negativeWords.has(word)) score -= 1.0;
    }

    return score / tokens.length;
  }

  /**
   * Computes the Jaccard similarity of the lower-cased, whitespace-separated
   * token sets of the two texts.
   * @param {string} text1 - First text
   * @param {string} text2 - Second text
   * @returns {number} |intersection| / |union| in 0..1; 0 when neither text
   *   contains a token
   */
  calculateSimilarity(text1, text2) {
    const first = new Set(this._whitespaceTokens(text1));
    const second = new Set(this._whitespaceTokens(text2));

    const unionSize = new Set([...first, ...second]).size;
    if (unionSize === 0) return 0.0;

    const intersectionSize = [...first].filter(w => second.has(w)).length;
    return intersectionSize / unionSize;
  }

  /**
   * Extracts the most frequent non-stop-words of the text. Uses the same
   * normalisation as analyzeWordFrequency, then drops stop words and words
   * of two characters or fewer.
   * @param {string} text - Text to analyse
   * @param {number} topN - Maximum number of keywords to return
   * @returns {Array} Keywords sorted by descending frequency
   */
  extractKeywords(text, topN = 5) {
    const stopWords = new Set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'is', 'are']);

    const candidates = this._normalizedWords(text)
      .filter(w => w.length > 2 && !stopWords.has(w));

    return [...this._countWords(candidates)]
      .sort((a, b) => b[1] - a[1])
      .slice(0, topN)
      .map(entry => entry[0]);
  }

  /**
   * Collects basic statistics about the text. Words are whitespace-separated
   * tokens (punctuation stays attached); sentences are the non-blank
   * fragments between '.', '!' and '?'.
   * @param {string} text - Text to analyse
   * @returns {Object} characterCount, wordCount, sentenceCount, uniqueWords,
   *   averageWordLength, averageWordsPerSentence, lexicalDiversity
   */
  getTextStatistics(text) {
    const words = text.split(/\s+/).filter(w => w.length > 0);
    const sentences = text.split(/[.!?]/).filter(s => s.trim().length > 0);
    const uniqueWords = new Set(words).size;
    // Average over the words themselves, not the whole text, so that
    // whitespace between words does not inflate the value.
    const totalWordLength = words.reduce((sum, w) => sum + w.length, 0);

    return {
      characterCount: text.length,
      wordCount: words.length,
      sentenceCount: sentences.length,
      uniqueWords: uniqueWords,
      averageWordLength: words.length > 0 ? totalWordLength / words.length : 0.0,
      averageWordsPerSentence: sentences.length > 0 ? words.length / sentences.length : 0.0,
      lexicalDiversity: words.length > 0 ? uniqueWords / words.length : 0.0
    };
  }

  /**
   * Splits the text into sentences at '.', '!' and '?'.
   * @param {string} text - Text to split
   * @returns {Array} Trimmed, non-empty sentences without their terminators
   */
  splitSentences(text) {
    return text.split(/[.!?]/)
      .map(s => s.trim())
      .filter(s => s.length > 0);
  }

  /**
   * Guesses whether the text is mainly Chinese or English by comparing the
   * number of characters in U+4E00..U+9FA5 with the number of ASCII letters.
   * @param {string} text - Text to inspect
   * @returns {string} 'zh', 'en', or 'unknown' when the counts are equal
   */
  detectLanguage(text) {
    const chineseCount = (text.match(/[\u4e00-\u9fa5]/g) || []).length;
    const englishCount = (text.match(/[a-zA-Z]/g) || []).length;

    if (chineseCount > englishCount) return 'zh';
    if (englishCount > chineseCount) return 'en';
    return 'unknown';
  }

  /**
   * Computes a simple readability index:
   * characters / words + words / sentences.
   * @param {string} text - Text to analyse
   * @returns {number} The index, or 0 when the text has no words or no sentences
   */
  calculateReadability(text) {
    const stats = this.getTextStatistics(text);
    if (stats.wordCount === 0 || stats.sentenceCount === 0) return 0.0;

    return (stats.characterCount / stats.wordCount) + (stats.wordCount / stats.sentenceCount);
  }

  /**
   * Builds a summary from the first sentences of the text.
   * @param {string} text - Text to summarise
   * @param {number} sentenceCount - Maximum number of sentences to keep
   * @returns {string} Selected sentences joined with '. ' and terminated
   *   with '.', or an empty string when there is nothing to summarise
   */
  generateSummary(text, sentenceCount = 3) {
    const selected = this.splitSentences(text).slice(0, sentenceCount);
    return selected.length > 0 ? selected.join('. ') + '.' : '';
  }

  /**
   * Internal: lower-cases the text, strips everything except a-z, 0-9 and
   * whitespace, and splits it into non-empty words.
   * @param {string} text
   * @returns {Array}
   */
  _normalizedWords(text) {
    return text.toLowerCase()
      .replace(/[^a-z0-9\s]/g, '')
      .split(/\s+/)
      .filter(w => w.length > 0);
  }

  /**
   * Internal: lower-cases the text and splits it on whitespace, dropping the
   * empty tokens that split() yields at the edges.
   * @param {string} text
   * @returns {Array}
   */
  _whitespaceTokens(text) {
    return text.toLowerCase()
      .split(/\s+/)
      .filter(t => t.length > 0);
  }

  /**
   * Internal: counts occurrences of each word.
   * @param {Array} words
   * @returns {Map} word -> count, in first-occurrence order
   */
  _countWords(words) {
    const counts = new Map();
    for (const word of words) {
      counts.set(word, (counts.get(word) || 0) + 1);
    }
    return counts;
  }
}

JavaScript 实现的特点

JavaScript 版本完全由 Kotlin/JS 编译器自动生成，确保了与 Kotlin 版本的行为完全一致。JavaScript 的正则表达式和 Set 对象提供了强大的文本处理能力。

split 方法用于分割文本。match 方法用于查找匹配。Set 用于去重和集合操作。

ArkTS 调用代码

OpenHarmony 应用集成

// 文件: kmp_ceshiapp/entry/src/main/ets/pages/TextAnalyzerPage.ets

import { TextAnalyzer } from '../../../../../../../build/js/packages/kmp_openharmony-js/kotlin/kmp_openharmony';

/** One entry of the operation picker: button label plus the key dispatched on in executeOperation(). */
interface OperationItem {
  name: string;
  value: string;
}

/**
 * Demo page for the TextAnalyzer library: the user picks an operation, enters
 * text, and the formatted result of the selected analysis is shown below.
 */
@Entry
@Component
struct TextAnalyzerPage {
  // Key of the currently selected operation (one of the `value`s in `operations`).
  @State selectedOperation: string = 'frequency';
  // Text typed by the user.
  @State inputText: string = '';
  // Formatted result body; empty until an analysis has run.
  @State result: string = '';
  // Result heading; the result panel is rendered only while this is non-empty.
  @State resultTitle: string = '';

  private analyzer = new TextAnalyzer();

  // Fixed text the 'similarity' operation compares the input against
  // (the page has no second input field).
  private similarityReference: string = 'good great excellent';

  // Typed through OperationItem because ArkTS rejects untyped object literals
  // and object-literal type annotations.
  private operations: OperationItem[] = [
    { name: '词频分析', value: 'frequency' },
    { name: '最常见词', value: 'topWords' },
    { name: '情感分析', value: 'sentiment' },
    { name: '文本相似度', value: 'similarity' },
    { name: '关键词提取', value: 'keywords' },
    { name: '文本统计', value: 'statistics' },
    { name: '句子分割', value: 'sentences' },
    { name: '语言检测', value: 'language' },
    { name: '可读性指数', value: 'readability' },
    { name: '文本摘要', value: 'summary' }
  ];

  build() {
    Column() {
      // Page title bar
      Text('📝 文本分析工具库')
        .fontSize(28)
        .fontWeight(FontWeight.Bold)
        .fontColor('#FFFFFF')
        .width('100%')
        .padding(20)
        .backgroundColor('#1A237E')
        .textAlign(TextAlign.Center)

      Scroll() {
        Column() {
          // Operation picker: one button per entry, the selected one highlighted
          Column() {
            Text('选择操作')
              .fontSize(14)
              .fontWeight(FontWeight.Bold)
              .fontColor('#333333')
              .margin({ bottom: 12 })

            Flex({ wrap: FlexWrap.Wrap }) {
              ForEach(this.operations, (op: OperationItem) => {
                Button(op.name)
                  .layoutWeight(1)
                  .height(40)
                  .margin({ right: 8, bottom: 8 })
                  .backgroundColor(this.selectedOperation === op.value ? '#1A237E' : '#E0E0E0')
                  .fontColor(this.selectedOperation === op.value ? '#FFFFFF' : '#333333')
                  .fontSize(11)
                  .onClick(() => {
                    // Switching operation discards the previous result.
                    this.selectedOperation = op.value;
                    this.result = '';
                    this.resultTitle = '';
                  })
              }, (op: OperationItem) => op.value)
            }
            .width('100%')
          }
          .width('95%')
          .margin({ top: 16, left: '2.5%', right: '2.5%', bottom: 16 })
          .padding(12)
          .backgroundColor('#FFFFFF')
          .borderRadius(6)

          // Input area
          Column() {
            Text('输入文本')
              .fontSize(14)
              .fontWeight(FontWeight.Bold)
              .fontColor('#333333')
              .margin({ bottom: 8 })

            // TextArea rather than TextInput: TextInput is single-line, while
            // several operations (sentences, summary) need multi-sentence input.
            TextArea({ placeholder: '输入要分析的文本', text: this.inputText })
              .onChange((value: string) => {
                this.inputText = value;
              })
              .width('100%')
              .height(120)
              .padding(12)
              .border({ width: 1, color: '#4DB6AC' })
              .borderRadius(6)
              .fontSize(12)
          }
          .width('95%')
          .margin({ left: '2.5%', right: '2.5%', bottom: 16 })
          .padding(12)
          .backgroundColor('#FFFFFF')
          .borderRadius(6)

          // Action buttons: run the analysis / reset the page
          Row() {
            Button('✨ 分析')
              .layoutWeight(1)
              .height(44)
              .backgroundColor('#1A237E')
              .fontColor('#FFFFFF')
              .fontSize(14)
              .fontWeight(FontWeight.Bold)
              .borderRadius(6)
              .onClick(() => this.executeOperation())

            Blank()
              .width(12)

            Button('🔄 清空')
              .layoutWeight(1)
              .height(44)
              .backgroundColor('#F5F5F5')
              .fontColor('#1A237E')
              .fontSize(14)
              .border({ width: 1, color: '#4DB6AC' })
              .borderRadius(6)
              .onClick(() => {
                this.inputText = '';
                this.result = '';
                this.resultTitle = '';
              })
          }
          .width('95%')
          .margin({ left: '2.5%', right: '2.5%', bottom: 16 })

          // Result panel, rendered only once a title has been set
          if (this.resultTitle) {
            Column() {
              Text(this.resultTitle)
                .fontSize(16)
                .fontWeight(FontWeight.Bold)
                .fontColor('#FFFFFF')
                .width('100%')
                .padding(12)
                .backgroundColor('#1A237E')
                .borderRadius(6)
                .textAlign(TextAlign.Center)
                .margin({ bottom: 12 })

              Scroll() {
                Text(this.result)
                  .fontSize(12)
                  .fontColor('#333333')
                  .fontFamily('monospace')
                  .textAlign(TextAlign.Start)
                  .width('100%')
                  .padding(12)
                  .selectable(true)
              }
              .width('100%')
              .height(300)
              .backgroundColor('#F9F9F9')
              .border({ width: 1, color: '#4DB6AC' })
              .borderRadius(6)
            }
            .width('95%')
            .margin({ left: '2.5%', right: '2.5%', bottom: 16 })
            .padding(12)
            .backgroundColor('#FFFFFF')
            .borderRadius(6)
          }
        }
        .width('100%')
      }
      .layoutWeight(1)
      .width('100%')
    }
    .width('100%')
    .height('100%')
    .backgroundColor('#F5F5F5')
  }

  /**
   * Runs the selected operation on the current input and stores the formatted
   * outcome in `resultTitle` / `result`. Blank input and any exception thrown
   * by the analyzer are reported through the same two fields.
   */
  private executeOperation() {
    if (!this.inputText.trim()) {
      this.resultTitle = '❌ 错误';
      this.result = '请输入文本';
      return;
    }

    try {
      // Each case has its own block so its `const` declarations are scoped
      // to that case instead of the whole switch.
      switch (this.selectedOperation) {
        case 'frequency': {
          const freq = this.analyzer.analyzeWordFrequency(this.inputText);
          this.resultTitle = '📊 词频分析';
          // Index access instead of a destructured parameter, which ArkTS rejects.
          this.result = Object.entries(freq)
            .map((entry: [string, number]) => `${entry[0]}: ${entry[1]}`)
            .join('\n');
          break;
        }

        case 'topWords': {
          const topWords = this.analyzer.getTopWords(this.inputText, 10);
          this.resultTitle = '🏆 最常见词';
          this.result = topWords
            .map((entry: [string, number]) => `${entry[0]}: ${entry[1]}`)
            .join('\n');
          break;
        }

        case 'sentiment': {
          const sentiment = this.analyzer.analyzeSentiment(this.inputText);
          this.resultTitle = '😊 情感分析';
          const sentimentLabel = sentiment > 0.3 ? '积极' : sentiment < -0.3 ? '消极' : '中立';
          this.result = `情感分数: ${sentiment.toFixed(2)}\n情感倾向: ${sentimentLabel}`;
          break;
        }

        case 'similarity': {
          const similarity = this.analyzer.calculateSimilarity(this.inputText, this.similarityReference);
          this.resultTitle = '🔄 文本相似度';
          this.result = `相似度: ${(similarity * 100).toFixed(2)}%`;
          break;
        }

        case 'keywords': {
          const keywords = this.analyzer.extractKeywords(this.inputText, 5);
          this.resultTitle = '🔑 关键词提取';
          this.result = keywords.join('\n');
          break;
        }

        case 'statistics': {
          const stats = this.analyzer.getTextStatistics(this.inputText);
          this.resultTitle = '📈 文本统计';
          this.result = `字符数: ${stats.characterCount}\n单词数: ${stats.wordCount}\n句子数: ${stats.sentenceCount}\n独特词数: ${stats.uniqueWords}\n平均词长: ${stats.averageWordLength.toFixed(2)}\n词汇多样性: ${(stats.lexicalDiversity * 100).toFixed(2)}%`;
          break;
        }

        case 'sentences': {
          const sentences = this.analyzer.splitSentences(this.inputText);
          this.resultTitle = '✂️ 句子分割';
          this.result = sentences.map((s: string, i: number) => `${i + 1}. ${s}`).join('\n');
          break;
        }

        case 'language': {
          const lang = this.analyzer.detectLanguage(this.inputText);
          this.resultTitle = '🌐 语言检测';
          const langName = lang === 'zh' ? '中文' : lang === 'en' ? '英文' : '未知';
          this.result = `检测语言: ${langName}`;
          break;
        }

        case 'readability': {
          const readability = this.analyzer.calculateReadability(this.inputText);
          this.resultTitle = '📖 可读性指数';
          this.result = `可读性指数: ${readability.toFixed(2)}`;
          break;
        }

        case 'summary': {
          const summary = this.analyzer.generateSummary(this.inputText, 3);
          this.resultTitle = '📄 文本摘要';
          this.result = summary;
          break;
        }

        default: {
          // Not reachable through the picker; reported instead of silently
          // leaving the result panel empty.
          this.resultTitle = '❌ 错误';
          this.result = `未知操作: ${this.selectedOperation}`;
          break;
        }
      }
    } catch (e) {
      this.resultTitle = '❌ 分析出错';
      this.result = `错误: ${e}`;
    }
  }
}