Source code for elgeopaso.utils.text_toolbelt

#! python3  # noqa: E265


"""
    Tool.
"""


# ###########################################################################
# ######### Libraries #############
# #################################

# Standard library
import html
import logging
import re

# 3rd party
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

# submodules
from .custom_stopwords import TUP_CUSTOM_STOPWORDS

# ##############################################################################
# ########## Globals ###############
# ##################################

# logs
logger = logging.getLogger(__name__)

# regex helper matching HTML markups and character references
_regex_markups = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
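
# Hedged illustration (added note, not part of the original module): the
# pattern above matches raw tags and leftover character references, e.g.
#   _regex_markups.sub(" ", "<p>x &amp; y</p>")  ->  " x   y "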


# ############################################################################
# ########## Classes ##############
# #################################


class TextToolbelt:
    """Tools to manipulate text: tokenize, clean, etc."""

    def __init__(self):
        """Instantiate module."""
        super(TextToolbelt, self).__init__()

    @classmethod
    def remove_html_markups(cls, html_text: str, cleaner: str = "bs-lxml") -> str:
        """Very basic cleaner for HTML markups.

        :param str html_text: text to be cleaned
        :param str cleaner: which lib to use to clean the text:

            - "bs-lxml": BeautifulSoup4 + LXML - Default.
            - "psl-only": Python Standard Library only (html + regex)

        :return: cleaned text
        :rtype: str
        """
        # with BeautifulSoup + LXML
        if cleaner == "bs-lxml":
            cleaned_text = BeautifulSoup(html_text, "lxml").text
        elif cleaner == "psl-only":
            # convert HTML5 character references into str.
            # See: https://docs.python.org/3/library/html.html#html.unescape
            html_text = html.unescape(html_text)
            cleaned_text = _regex_markups.sub(" ", html_text)
        else:
            raise ValueError(
                "'cleaner' must be 'bs-lxml' or 'psl-only', not: {}".format(cleaner)
            )

        return cleaned_text
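
    # Hedged usage sketch (added illustration, not in the original module):
    #   TextToolbelt.remove_html_markups("<p>Caf&eacute;</p>")              -> "Café"
    #   TextToolbelt.remove_html_markups("<p>Caf&eacute;</p>", "psl-only")  -> " Café "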

    @classmethod
    def tokenize(cls, input_content: str) -> list:
        """Extract the words mentioned in the offers, in order to perform a
        semantic analysis. Mainly based on NLTK: https://www.nltk.org/.

        :param str input_content: input text to parse and tokenize

        :return: list of tokenized words
        :rtype: list
        """
        # get list of common French words to filter out
        stop_fr = set(stopwords.words("french"))

        # clean up HTML markups from the input content
        contenu = cls.remove_html_markups(input_content)

        # tokenize
        contenu_tokenized = nltk.word_tokenize(contenu)

        # filter out stop words (common French words + custom list)
        contenu_tokenized = [
            mot
            for mot in contenu_tokenized
            if mot not in stop_fr and mot not in TUP_CUSTOM_STOPWORDS
        ]

        logger.debug("Words parsed: {}".format(len(contenu_tokenized)))

        return contenu_tokenized

# ############################################################################
# #### Stand alone program ########
# #################################
if __name__ == "__main__":
    """Standalone execution."""
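    # Hedged usage sketch (added illustration; the sample string is assumed and
    # tokenize() requires the NLTK "stopwords" and "punkt" data, e.g. fetched
    # via nltk.download("stopwords") and nltk.download("punkt")):
    sample_offer = "<p>D&eacute;veloppeur <b>SIG</b> confirm&eacute;</p>"
    print(TextToolbelt.remove_html_markups(sample_offer))
    print(TextToolbelt.remove_html_markups(sample_offer, cleaner="psl-only"))
    print(TextToolbelt.tokenize(sample_offer))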