美文网首页
004 analyze & analyzer

004 analyze & analyzer

作者: zhu733756 | 来源:发表于2020-03-15 20:49 被阅读0次

#精确值不需要做分词的处理

#对个别字段进行个性化分词

#character filter,字符过滤规则,自带的有html_strip,pattern_replace,mapping

#tokenizer:分词器,主要有whitespace,standard,uax_url_email,pattern,keyword,path_hierarchy

#filter,跟char_filter一样都能自定义,自带的token filter有lowercase,stop,synonym

# Strip HTML tags: the html_strip char filter runs on the raw text first,
# then the keyword tokenizer is applied to whatever text is left.
POST _analyze
{
  "text": "<div>hello world</div>",
  "char_filter": [
    "html_strip"
  ],
  "tokenizer": "keyword"
}

# Custom replacement rules: an inline mapping char filter rewrites "-" to "_"
# and the two emoticons to words before the standard tokenizer runs.
POST _analyze
{
  "text": ":)123213-243243-ddsfsfs-(:",
  "char_filter": [
    {
      "type": "mapping",
      "mappings": [
        "- => _",
        ":) => happy",
        "(: => sad"
      ]
    }
  ],
  "tokenizer": "standard"
}

# Regular-expression replacement: pattern_replace captures everything after
# "http://" and substitutes that capture group ($1) for the whole match.
POST _analyze
{
  "text": [
    "http://www.baidu.com",
    "http://sb.sx.com"
  ],
  "char_filter": [
    {
      "type": "pattern_replace",
      "pattern": "http://(.*)",
      "replacement": "$1"
    }
  ],
  "tokenizer": "standard"
}

# whitespace tokenizer + stop filter.
# The char filter deletes "?", "!" and "." before the text is split on whitespace.
# lowercase is listed BEFORE stop on purpose: token filters run in array order and
# the stop filter compares case-sensitively unless ignore_case is enabled, so with
# the opposite order the capitalised "The" would not be removed.
POST _analyze
{
  "tokenizer": "whitespace",
  "filter": [
    "lowercase",
    "stop"
  ],
  "char_filter": [
    {
      "type": "pattern_replace",
      "pattern": "[?!.]",
      "replacement": ""
    }
  ],
  "text": [
    "What are you saying? The dog on the tree?"
  ]
}

# Custom analyzer: recreate my_index with an analyzer assembled from
#   - char filter  "emotions"     : mapping rules applied to the raw text
#   - tokenizer    "pun"          : pattern tokenizer splitting on punctuation/space
#   - token filters lowercase, then "english_stop" (the _english_ stop word list)
# Both ":(" and "(:" map to "sad" so that the usual sad emoticon is covered
# as well as the mirrored form.
DELETE my_index

PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "type": "custom",
          "char_filter": [
            "emotions"
          ],
          "tokenizer": "pun",
          "filter": [
            "lowercase",
            "english_stop"
          ]
        }
      },
      "tokenizer": {
        "pun": {
          "type": "pattern",
          "pattern": "[?!.,。 ]"
        }
      },
      "char_filter": {
        "emotions": {
          "type": "mapping",
          "mappings": [
            "- => _",
            ":) => happy",
            ":( => sad",
            "(: => sad"
          ]
        }
      },
      "filter": {
        "english_stop": {
          "type": "stop",
          "stopwords": "_english_"
        }
      }
    }
  }
}

# Run the index-level analyzer my_custom_analyzer against a sample sentence
# containing emoticons and a run of ASCII and CJK punctuation.
POST my_index/_analyze
{
  "text": ":):(I am so happy to use my custom analyzer!!!!.....。。。。",
  "analyzer": "my_custom_analyzer"
}

# Recreate the "test" index with a HanLP-based analyzer.
#   - tokenizer "hanlp_test" uses type hanlp_nlp, which is not a built-in
#     tokenizer (presumably provided by a HanLP analysis plugin; confirm it is installed).
#   - filter "keep_n_v" is a keep_types filter listing the token types "n" and "v"
#     (presumably HanLP noun / verb tags; verify against the plugin's tag set).
# "filter" is written as an array and "type" is spelled out, matching the
# documented custom-analyzer form.
DELETE test

PUT test
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_hanlp_analyzer": {
          "type": "custom",
          "tokenizer": "hanlp_test",
          "filter": [
            "keep_n_v"
          ]
        }
      },
      "tokenizer": {
        "hanlp_test": {
          "type": "hanlp_nlp",
          "enable_stop_dictionary": true,
          "enable_custom_config": true,
          "enable_place_recognize": true,
          "enable_name_recognize": true,
          "enable_part_of_speech_tagging": true,
          "enable_japanese_name_recognize": true
        }
      },
      "filter": {
        "keep_n_v": {
          "type": "keep_types",
          "types": [
            "n",
            "v"
          ]
        }
      }
    }
  }
}

# Analyze a Chinese paragraph with the index's hanlp_test tokenizer and an
# inline keep_types filter listing a wider set of token types than keep_n_v.
# "filter" is given as an array of definitions, which is the documented shape
# for the Analyze API; do not rely on a bare object being accepted.
POST test/_analyze
{
  "text": "夜里。夜深人静。房间之中,那副画蠢蠢欲动。蔓延在整个房间之中的树枝开始渐渐开出了樱花。几乎在一夜之间,所有的枝头都挂满了樱花。睡梦之中,卡卡西只觉得一股疲倦感渐渐涌了上来。左手腕上的的白色狐狸面具慢慢消散,最后竟是消失不见。“八坂先生”巳弥满眼心疼地看着那道影子,缓缓靠了上去。随即想到了什么,跳上了樱花树。那满是樱花的树上,巳弥伸出了自己的手臂。黑影缓缓地抬起了头。一切好像又回到了从前。",
  "tokenizer": "hanlp_test",
  "filter": [
    {
      "type": "keep_types",
      "types": [
        "n",
        "nz",
        "v",
        "vi",
        "vl",
        "a"
      ]
    }
  ]
}

相关文章

网友评论

      本文标题:004 analyze & analyzer

      本文链接:https://www.haomeiwen.com/subject/almbehtx.html