Source code for elgeopaso.utils.text_toolbelt

#! python3  # noqa: E265


"""
    Tool.
"""


# ###########################################################################
# ######### Libraries #############
# #################################

# Standard library
import html
import logging
import re

# 3rd party
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

# submodules
from .custom_stopwords import TUP_CUSTOM_STOPWORDS

# ##############################################################################
# ########## Globals ###############
# ##################################

# logs
logger = logging.getLogger(__name__)

# regex helper matching HTML markups and character references
_regex_markups = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
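
# Hedged illustration (added note, not part of the original module): the
# pattern above matches raw tags and leftover character references, e.g.
#   _regex_markups.sub(" ", "<p>x &amp; y</p>")  ->  " x   y "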


# ############################################################################
# ########## Classes ##############
# #################################


class TextToolbelt:
    """Tools to manipulate text: tokenize, clean, etc."""

    def __init__(self):
        """Instantiate module."""
        super(TextToolbelt, self).__init__()

    @classmethod
    def remove_html_markups(cls, html_text: str, cleaner: str = "bs-lxml") -> str:
        """Very basic cleaner for HTML markups.

        :param str html_text: text to be cleaned
        :param str cleaner: which lib to use to clean the text:

            - "bs-lxml": BeautifulSoup4 + LXML - Default.
            - "psl-only": Python Standard Library only (html + regex)

        :return: cleaned text
        :rtype: str
        """
        # with BeautifulSoup + LXML
        if cleaner == "bs-lxml":
            cleaned_text = BeautifulSoup(html_text, "lxml").text
        elif cleaner == "psl-only":
            # convert HTML5 character references into str.
            # See: https://docs.python.org/3/library/html.html#html.unescape
            html_text = html.unescape(html_text)
            cleaned_text = _regex_markups.sub(" ", html_text)
        else:
            raise ValueError(
                "'cleaner' must be 'bs-lxml' or 'psl-only', not: {}".format(cleaner)
            )

        return cleaned_text
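
    # Hedged usage sketch (added illustration, not in the original module):
    #   TextToolbelt.remove_html_markups("<p>Caf&eacute;</p>")              -> "Café"
    #   TextToolbelt.remove_html_markups("<p>Caf&eacute;</p>", "psl-only")  -> " Café "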

    @classmethod
    def tokenize(cls, input_content: str) -> list:
        """Extract the words mentioned in the offers, in order to perform a
        semantic analysis. Mainly based on NLTK: https://www.nltk.org/.

        :param str input_content: input text to parse and tokenize

        :return: list of tokenized words
        :rtype: list
        """
        # get list of common French words to filter out
        stop_fr = set(stopwords.words("french"))

        # clean up HTML markups from the input content
        contenu = cls.remove_html_markups(input_content)

        # tokenize
        contenu_tokenized = nltk.word_tokenize(contenu)

        # filter out stop words (common French words + custom list)
        contenu_tokenized = [
            mot
            for mot in contenu_tokenized
            if mot not in stop_fr and mot not in TUP_CUSTOM_STOPWORDS
        ]

        logger.debug("Words parsed: {}".format(len(contenu_tokenized)))

        return contenu_tokenized

# ############################################################################
# #### Stand alone program ########
# #################################
if __name__ == "__main__":
    """Standalone execution."""
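    # Hedged usage sketch (added illustration; the sample string is assumed and
    # tokenize() requires the NLTK "stopwords" and "punkt" data, e.g. fetched
    # via nltk.download("stopwords") and nltk.download("punkt")):
    sample_offer = "<p>D&eacute;veloppeur <b>SIG</b> confirm&eacute;</p>"
    print(TextToolbelt.remove_html_markups(sample_offer))
    print(TextToolbelt.remove_html_markups(sample_offer, cleaner="psl-only"))
    print(TextToolbelt.tokenize(sample_offer))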