# Source code for indra_cogex.sources.processor_util

# See https://neo4j.com/docs/operations-manual/4.4/tools/neo4j-admin/neo4j-admin-import/#import-tool-header-format-properties
# and
# https://neo4j.com/docs/api/python-driver/current/api.html#data-types
# for available data types.
from typing import Any, Literal, get_args

# Static type alias for the Neo4j data type names. This is the single
# place where the names are listed; the runtime tuple below is derived from
# it so the two can never drift apart.
DataTypes = Literal[
    "int",
    "long",
    "float",
    "double",
    "boolean",
    "byte",
    "short",
    "char",
    "string",
    "point",
    "date",
    "localtime",
    "time",
    "localdatetime",
    "datetime",
    "duration",
    # Used in node files
    "ID",
    "LABEL",
    # Used in relationship files
    "START_ID",
    "END_ID",
    "TYPE",
]

# Runtime tuple of the same names, in the order they are declared above.
NEO4J_DATA_TYPES = get_args(DataTypes)


class DataTypeError(TypeError):
    """Raised when a data value is not of the expected type."""


class UnknownTypeError(TypeError):
    """Raised when a data type is not recognized."""


# Neo4j numeric types mapped to the Python type a value must have, or be
# parseable as when it is given as a string.
_NUMERIC_TYPES = {
    "int": int,
    "long": int,
    "short": int,
    "float": float,
    "double": float,
}

# Neo4j types that only need an isinstance check: the accepted Python
# type(s) paired with the wording used in the error message.
_STR_OR_INT = ((str, int), "str or int")
_INSTANCE_CHECKS = {
    "byte": ((bytes, int), "bytes or int"),
    "char": (str, "str"),
    # Numbers are accepted for strings since they have a string representation
    "string": ((str, int, float), "str, int or float"),
    # Todo: make stricter validation for dates and times:
    # https://neo4j.com/docs/cypher-manual/4.4/syntax/temporal/#cypher-temporal-instants
    "date": _STR_OR_INT,
    "localtime": _STR_OR_INT,
    "time": _STR_OR_INT,
    "localdatetime": _STR_OR_INT,
    "datetime": _STR_OR_INT,
    "duration": _STR_OR_INT,
    "ID": _STR_OR_INT,
    "LABEL": _STR_OR_INT,
    "START_ID": _STR_OR_INT,
    "END_ID": _STR_OR_INT,
    "TYPE": _STR_OR_INT,
}


def _is_null(value: Any) -> bool:
    """Return True if ``value`` is one of the null markers ``None`` or ``""``."""
    # Tested explicitly rather than via set membership: membership in a set
    # hashes the value, which raises a bare TypeError for unhashable values
    # such as lists.
    return value is None or (isinstance(value, str) and value == "")


def _type_error(
    val: Any, data_type: str, expected: str, got: str = ""
) -> DataTypeError:
    """Build the DataTypeError describing a value/type mismatch."""
    if not got:
        got = f"{type(val)}"
    return DataTypeError(
        f"Data value '{val}' is of the wrong type to conform with "
        f"Neo4j type {data_type}. Expected a value of type {expected}, "
        f"but got value of type {got} instead."
    )


def _check_number(val: Any, data_type: str, py_type: type) -> None:
    """Raise DataTypeError unless ``val`` is, or parses as, ``py_type``."""
    type_name = py_type.__name__
    if isinstance(val, str):
        # Values read back from dumped files are strings; try to convert
        try:
            val = py_type(val)
        except ValueError as e:
            raise _type_error(
                val, data_type, type_name, f"str with value '{val}'"
            ) from e
    # NOTE: bool is a subclass of int, so True/False pass the integer check.
    # An int does not pass the float check.
    if not isinstance(val, py_type):
        raise _type_error(val, data_type, type_name)


def data_validator(data_type: str, value: Any) -> None:
    """Validate that the data type matches the value.

    Parameters
    ----------
    data_type :
        The Neo4j data type to validate against. A trailing ``[]`` marks an
        array type, in which case a string value is split on ``;`` and each
        element is validated separately.
    value :
        The value to validate.

    Raises
    ------
    DataTypeError
        If the value does not validate against the Neo4j data type.
    UnknownTypeError
        If data_type is not recognized as a Neo4j data type.
    NotImplementedError
        If data_type is ``point``, for which no validation exists.
    """
    # None's are provided in the data dictionaries upon initial
    # node/relationship generation as a missing/null value. Once dumped,
    # the None's are converted to empty strings which is read in when nodes
    # are assembled. If we encounter a null value, there is no need to
    # validate it.
    if _is_null(value):
        return

    is_array = data_type.endswith("[]")
    if is_array and isinstance(value, str):
        values = value.split(";")
    else:
        values = [value]
    values = [val for val in values if not _is_null(val)]
    if not values:
        return

    # Remove exactly one array suffix. str.rstrip("[]") would strip any run
    # of '[' and ']' characters and so accept malformed names like "int]".
    if is_array:
        data_type = data_type[:-2]

    if data_type in _NUMERIC_TYPES:
        py_type = _NUMERIC_TYPES[data_type]
        for val in values:
            _check_number(val, data_type, py_type)
    elif data_type == "boolean":
        # Only the literal strings 'true' and 'false' are accepted
        for val in values:
            if not isinstance(val, str) or val not in ("true", "false"):
                raise _type_error(
                    val,
                    data_type,
                    "str with literal value 'true' or 'false'",
                    f"{type(val)} with value '{val}'",
                )
    elif data_type == "point":
        raise NotImplementedError(
            "Neo4j point data type validation is not implemented"
        )
    elif data_type in _INSTANCE_CHECKS:
        accepted, expected = _INSTANCE_CHECKS[data_type]
        for val in values:
            if not isinstance(val, accepted):
                raise _type_error(val, data_type, expected)
    else:
        raise UnknownTypeError(
            f"{data_type} is not recognized as a Neo4j data type."
        )