Code source de elgeopaso.jobs.analyzer.georezo.parsers.title

#! python3  # noqa: E265


"""
    Title parser.
"""


# ###########################################################################
# ######### Libraries #############
# #################################

# Standard library
import logging
import re

# project modules
from elgeopaso.jobs.models import (
    Contract,
    ContractVariations,
    JobPosition,
    JobPositionVariations,
    Place,
    PlaceVariations,
)
from elgeopaso.utils import TextToolbelt

# ##############################################################################
# ########## Globals ###############
# ##################################

# logs
logger = logging.getLogger(__name__)

# timestamps format helpers
_regex_markups = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")

# shortcuts
txt_toolbelt = TextToolbelt()

# ############################################################################
# ########## Classes ##############
# #################################


[docs]class TitleParser:
    """Parse title of offers published on GeoRezo to extract informations.

    :param int offer_id: offer ID (for tracing purposes)
    :param str input_title: title to parse
    """

    def __init__(self, offer_id: int, input_title: str):
        """Instanciate title parser module."""
        # parameters
        self.offer_id = offer_id
        self.input_title = input_title

        # tokenize content
        self.tokenized_title = txt_toolbelt.tokenize(self.input_title)

    # PARSERS ----------------------------------------------------------------
[docs]    def parse_contract_type(self) -> Contract:
        """Extraction of types of contracts: CDI, CDD, mission, volontariat, etc.

        In theory, offer's title is formatted to contain the type between []...
        """
        # clean the title: excluding text out of brackets
        try:
            contract = self.input_title.split("[")[1].split("]")[0]
        except IndexError:
            logging.warning(
                "Title bad formatted. Offer RSS ID: {}".format(self.offer_id)
            )
            contract = self.input_title.split("]")[0].lstrip("[")

        logging.debug("Contract extracted from title: {}".format(contract))
        contract = contract.lower()

        # find a contract match
        if Contract.objects.filter(abbrv=contract).exists():
            return Contract.objects.get(label=contract)
        elif ContractVariations.objects.filter(label=contract).exists():
            contract_var = ContractVariations.objects.get(label=contract).name
            return Contract.objects.get(abbrv=contract_var)
        else:
            return Contract.objects.get(abbrv="ND")

[docs]    def parse_jobs_positions(self) -> list:
        """Identify job position ('métier') from passed string."""
        jobs_positions_matched = []

        # parse tokenized
        for word in self.tokenized_title:
            if JobPositionVariations.objects.filter(label=word.lower()).exists():
                job_label = JobPositionVariations.objects.get(label=word.lower()).name
                jobs_positions_matched.append(JobPosition.objects.get(name=job_label))
            else:
                continue

        logger.debug("Jobs positions identified: {}".format(jobs_positions_matched))
        return jobs_positions_matched

[docs]    def parse_place(self, mode: int = 0) -> Place:
        """
        Extraction of types of contracts: CDI, CDD, mission, volontariat, etc.
        In theory, place information is wihtin parenthesis '()'.

        :param int mode: 0 = STRICT regex (default): only digits between ()
                         1 = MEDIUM regex: alphanumeric between ()
                         2 = SOFT regex: alphanumeric code outside ()
        """
        # removing contract type between []
        try:
            title = self.input_title.split("[")[1].split("]")[1]
            logging.debug("Title without contract: {}".format(title))
        except IndexError:
            logging.error("Title bad formatted. Offer RSS ID: {}".format(self.offer_id))
            title = self.input_title

        # extract with regex
        if not mode:
            dpt_code = re.findall(r"\((\d+)\)", title)
            logging.debug("STRICT regex applied: {}".format(dpt_code))
        elif mode == 1:
            dpt_code = re.findall(r"\((2[AB]|[0-9]+)\)", title)
            logging.debug("MEDIUM regex applied: {}".format(dpt_code))
        elif mode == 2:
            dpt_code = re.findall(r"(2[AB]|[0-9]+)", title)
            logging.debug("SOFT regex applied: {}".format(dpt_code))
        else:
            raise TypeError("'mode' parameter only accepts an integer [0-2]")

        # match French department code
        if len(dpt_code) == 1:
            if Place.objects.filter(code=dpt_code[0]).exists():
                place_name = Place.objects.get(code=dpt_code[0]).name
                logging.debug("Place code MATCHED in title: {}".format(dpt_code))
                return Place.objects.get(name=place_name)
            else:
                logging.debug("Place code MATCHED in title: {}".format(title))
                # try again
                if mode < 2:
                    return self.parse_place(mode=mode + 1)
                else:
                    pass
        elif len(dpt_code) > 1:
            logging.warning(
                "More than possible department code found: {}.".format(
                    ";".join(dpt_code)
                )
            )
            # try again
            if mode < 2:
                return self.parse_place(mode=mode + 1)
            else:
                pass
        elif not len(dpt_code):
            logging.warning(
                "No place code found in title." " Trying to find a place anyway..."
            )
            t_place = title[title.find("(") + 1 : title.find(")")]
            if "," in t_place:
                t_place = t_place.lower().split(",")
            else:
                t_place = t_place.lower().split()
                pass
            # try to get a match in place variations
            for i in t_place:
                if PlaceVariations.objects.filter(label=i).exists():
                    pv = PlaceVariations.objects.get(label=i).name
                    logging.debug("Place found: {}".format(i))
                    return Place.objects.get(name=pv)
                else:
                    logging.debug("No place found in: {}".format(i))
            # try again
            if mode < 2:
                return self.parse_place(mode=mode + 1)
            else:
                logging.warning("No place found in title: {}".format(self.input_title))
                pass

        # method ending if no place found during various attempts
        return "ND"


# ############################################################################
# #### Stand alone program ########
# #################################
if __name__ == "__main__":
    """standalone execution."""
    pass