from __future__ import annotations
import logging
import os
from typing import TYPE_CHECKING, Callable, Dict, List, Tuple, TypeAlias
from pylatexenc.latex2text import LatexNodes2Text
from unidecode import unidecode
from academic_metrics.AI import AbstractClassifier
from academic_metrics.configs import (
configure_logging,
DEBUG,
)
from academic_metrics.enums import AttributeTypes
if TYPE_CHECKING:
from academic_metrics.utils import Utilities
# Mapping form of a classification result (string keys -> lists of strings).
# NOTE(review): the docstring below describes the keys as DOIs, while
# `_inject_categories` in this module reads the keys "top_categories",
# "mid_categories", "low_categories" and "themes" from a value of this type --
# confirm which shape the classifier actually returns.
ClassificationResultsDict: TypeAlias = Dict[str, List[str]]
"""Type alias for a dictionary mapping DOIs to lists of classification results.
This type alias is used to represent the return type of the
:meth:`~academic_metrics.AI.AbstractClassifier.get_classification_results_by_doi` method.
"""
# Positional form of a classification result: four lists of strings.
# `_inject_categories` reads them by index as
# (top categories, mid categories, low categories, themes).
ClassificationResultsTuple: TypeAlias = Tuple[
List[str], List[str], List[str], List[str]
]
"""Type alias for a tuple containing lists of classification results.
This type alias is used to represent the return type of the
:meth:`~academic_metrics.AI.AbstractClassifier.get_classification_results_by_doi` method.
Notes:
- Format of the tuple is (top_categories, mid_categories, low_categories, themes)
"""
[docs]
class ClassificationOrchestrator:
"""Manages the classification process for research abstracts.
Orchestrates the process of extracting DOIs and abstracts from research metadata,
classifying them using AbstractClassifier, and integrating results back into
the original data. Tracks unclassified items for monitoring.
Attributes:
abstract_classifier_factory (Callable[..., AbstractClassifier]): Factory function for AbstractClassifier instances.
taxonomy (Taxonomy): Classification hierarchy for AbstractClassifier.
utilities (Utilities): Utilities for attribute extraction.
ai_api_key (str): API key for AI service access.
unclassified_item_count (int): Count of unclassified items.
Type: int
unclassified_dois (List): DOIs of unclassified items.
Type: List[str]
unclassified_abstracts (List): Abstracts of unclassified items.
Type: List[str]
unclassified_doi_abstract_dict (Dict): Maps unclassified DOIs to abstracts.
Type: Dict[str, str]
unclassified_items (List): Complete metadata of unclassified items.
Type: List[Dict[str, Any]]
unclassified_details_dict (Dict): Organized unclassified data.
Type: Dict[str, Union[List[str], List[Dict[str, Any]]]]
Contains:
- dois: List of unclassified DOIs
- abstracts: List of unclassified abstracts
- items: List of unclassified metadata items
Methods:
run_classification() -> List[Dict]: Processes and classifies a list of research metadata dictionaries.
get_unclassified_item_count() -> int: Returns the number of unclassified items.
get_unclassified_dois() -> List[str]: Returns the DOIs of unclassified items.
get_unclassified_abstracts() -> List[str]: Returns the abstracts of unclassified items.
get_unclassified_doi_abstract_dict() -> Dict[str, str]: Returns the DOI to abstract mapping dictionary for unclassified items.
get_unclassified_items() -> List[Dict]: Returns the unclassified items.
get_unclassified_details_dict() -> Dict: Returns the details of unclassified items.
_classification_orchestrator() -> List[Dict]: Core classification logic for processing research metadata.
_inject_categories() -> None: Adds classification results to a research metadata dictionary.
_extract_categories() -> ClassificationResultsDict | ClassificationResultsTuple: Gets classification results for a specific DOI.
_make_doi_abstract_dict() -> Dict[str, str]: Creates a DOI to abstract mapping dictionary.
_get_classification_dependencies() -> Tuple[str, str, dict]: Extracts DOI, abstract, and extra context from a research metadata dictionary.
_update_classified_instance_variables() -> None: Updates tracking variables for unclassified items.
_set_classification_ran_true() -> None: Sets the classification ran flag to true.
_has_ran_classification() -> bool: Checks if classification has been run.
_validate_classification_ran() -> None: Validates if classification has been run.
_normalize_abstract() -> str: Normalizes an abstract by removing LaTeX and converting any resulting unicode to ASCII.
"""
[docs]
def __init__(
    self,
    abstract_classifier_factory: Callable[[Dict[str, str]], AbstractClassifier],
    utilities: Utilities,
):
    """Set up the orchestrator with its collaborators and empty trackers.

    Args:
        abstract_classifier_factory (Callable): Callable that builds an
            AbstractClassifier. It is stored as-is; no validation of the
            callable is performed here.
        utilities (Utilities): Helper object later used to extract
            attributes from metadata items.

    Notes:
        - Configures a DEBUG-level logger via ``configure_logging``.
        - Starts with the "classification ran" flag set to False.
        - Every unclassified-item tracker starts out empty.
    """
    self.logger = configure_logging(
        module_name=__name__,
        log_file_name="classification_orchestrator",
        log_level=DEBUG,
    )

    # Injected collaborators.
    self.abstract_classifier_factory = abstract_classifier_factory
    self.utilities = utilities

    # Guard flag: the public getters refuse to answer until a
    # classification run has completed.
    self._classification_ran: bool = False

    # Trackers for items that could not be classified.  The lists inside
    # `unclassified_details_dict` are separate objects from the
    # stand-alone tracker lists.
    self.unclassified_item_count: int = 0
    self.unclassified_dois: List[str] = list()
    self.unclassified_abstracts: List[str] = list()
    self.unclassified_doi_abstract_dict: Dict[str, str] = dict()
    self.unclassified_items: List[Dict] = list()
    self.unclassified_details_dict: Dict = dict(
        dois=list(),
        abstracts=list(),
        items=list(),
    )
[docs]
def run_classification(
    self,
    data: List[Dict],
    pre_classification_model: str | None = "gpt-4o-mini",
    classification_model: str | None = "gpt-4o-mini",
    theme_model: str | None = "gpt-4o-mini",
) -> List[Dict]:
    """Classify a list of research metadata dictionaries.

    Public entry point: delegates the work to
    ``_classification_orchestrator`` and, once that returns, marks the
    orchestrator as having run so the ``get_unclassified_*`` getters
    become usable.

    Args:
        data (List[Dict]): Research metadata items to classify.
        pre_classification_model (str | None): Model name forwarded for
            the pre-classification stage. Defaults to "gpt-4o-mini".
        classification_model (str | None): Model name forwarded for the
            main classification stage. Defaults to "gpt-4o-mini".
        theme_model (str | None): Model name forwarded for theme
            extraction. Defaults to "gpt-4o-mini".

    Returns:
        List[Dict]: Whatever ``_classification_orchestrator`` returns for
        ``data``.

    Notes:
        - The "ran" flag is only set after the delegate returns; if it
          raises, the flag is left untouched.
    """
    model_choices = {
        "pre_classification_model": pre_classification_model,
        "classification_model": classification_model,
        "theme_model": theme_model,
    }
    results = self._classification_orchestrator(data, **model_choices)
    self._set_classification_ran_true()
    return results
[docs]
def get_unclassified_item_count(self) -> int:
"""Gets the number of unclassified items.
Retrieves the count of items that could not be classified during the
classification process.
Returns:
int: Number of unclassified items.
Type: int
Raises:
RuntimeError: If classification has not been run yet
Notes:
- Validates classification status
- Returns current count
- Includes all unclassified types
- Requires prior classification run
"""
self._validate_classification_ran(self._has_ran_classification())
return self.unclassified_item_count
[docs]
def get_unclassified_dois(self) -> List[str]:
    """Return the DOIs recorded for unclassified items.

    Returns:
        List[str]: The ``unclassified_dois`` list itself (not a copy);
        empty when nothing has been recorded.

    Raises:
        RuntimeError: If classification has not been run yet.
    """
    has_run = self._has_ran_classification()
    self._validate_classification_ran(has_run)
    return self.unclassified_dois
[docs]
def get_unclassified_abstracts(self) -> List[str]:
    """Return the abstracts recorded for unclassified items.

    Returns:
        List[str]: The ``unclassified_abstracts`` list itself (not a
        copy); empty when nothing has been recorded.

    Raises:
        RuntimeError: If classification has not been run yet.
    """
    has_run = self._has_ran_classification()
    self._validate_classification_ran(has_run)
    return self.unclassified_abstracts
[docs]
def get_unclassified_doi_abstract_dict(self) -> Dict[str, str]:
    """Return the DOI -> abstract mapping of unclassified items.

    Returns:
        Dict[str, str]: The ``unclassified_doi_abstract_dict`` mapping
        itself (not a copy); empty when nothing has been recorded.

    Raises:
        RuntimeError: If classification has not been run yet.
    """
    has_run = self._has_ran_classification()
    self._validate_classification_ran(has_run)
    return self.unclassified_doi_abstract_dict
[docs]
def get_unclassified_items(self) -> List[Dict]:
    """Return the full metadata items recorded as unclassified.

    Returns:
        List[Dict]: The ``unclassified_items`` list itself (not a copy);
        empty when nothing has been recorded.

    Raises:
        RuntimeError: If classification has not been run yet.
    """
    has_run = self._has_ran_classification()
    self._validate_classification_ran(has_run)
    return self.unclassified_items
[docs]
def get_unclassified_details_dict(self) -> Dict:
    """Return the grouped details of unclassified items.

    Returns:
        Dict: The ``unclassified_details_dict`` mapping itself (not a
        copy). It is created with the keys ``"dois"``, ``"abstracts"``
        and ``"items"``, each holding a list.

    Raises:
        RuntimeError: If classification has not been run yet.
    """
    has_run = self._has_ran_classification()
    self._validate_classification_ran(has_run)
    return self.unclassified_details_dict
[docs]
def _classification_orchestrator(
    self,
    data: List[Dict],
    pre_classification_model: str | None = "gpt-4o-mini",
    classification_model: str | None = "gpt-4o-mini",
    theme_model: str | None = "gpt-4o-mini",
) -> List[Dict]:
    """Classify every item in ``data`` and inject the results in place.

    For each item the DOI, abstract and extra context are extracted, the
    abstract is normalized, a classifier is built through
    ``abstract_classifier_factory`` and run, and the resulting categories
    and themes are written onto the item.

    Args:
        data (List[Dict]): Research metadata items. The list is modified
            in place: items that cannot be classified are removed.
        pre_classification_model (str | None): Model name forwarded to the
            classifier factory. Defaults to "gpt-4o-mini".
        classification_model (str | None): Model name forwarded to the
            classifier factory. Defaults to "gpt-4o-mini".
        theme_model (str | None): Model name forwarded to the classifier
            factory. Defaults to "gpt-4o-mini".

    Returns:
        List[Dict]: The same ``data`` list, now containing only the items
        that were classified successfully.

    Notes:
        - An item whose DOI is missing, or whose abstract is missing or
          empty after normalization, is recorded through
          ``_update_classified_instance_variables`` and removed from
          ``data``. Previously such items were removed via the generic
          error path and the unclassified trackers were never populated,
          because the "empty mapping" check could not be reached
          (``_make_doi_abstract_dict`` raises instead of returning an
          empty dict).
        - Any other exception raised while processing an item is logged
          and the item is removed; processing continues with the next
          item.
    """
    i = 0
    # A `while` loop with a manually managed index is required because items
    # are removed from `data` during the walk.  After a `pop(i)` the next item
    # slides into position `i`, so the index only advances when the current
    # item is kept.
    while i < len(data):
        item = data[i]
        try:
            doi, abstract, extra_context = self._get_classification_dependencies(
                item
            )
            # Only normalize when there is text to normalize; a missing
            # abstract is treated as "nothing to classify", not as an error.
            normalized_abstract: str = (
                self._normalize_abstract(abstract) if abstract else ""
            )
            if not doi or not normalized_abstract:
                self._update_classified_instance_variables(
                    item=item, doi=doi, abstract=abstract
                )
                self.logger.warning(
                    f"Item {i} has no DOI or no usable abstract; "
                    "recording it as unclassified and removing it from data"
                )
                # Removed rather than kept so the returned list matches what
                # callers already receive for such items.
                data.pop(i)
                continue
            doi_abstract_dict: Dict[str, str] = self._make_doi_abstract_dict(
                doi, normalized_abstract
            )
            classifier: AbstractClassifier = self.abstract_classifier_factory(
                doi_abstract_dict=doi_abstract_dict,
                extra_context=extra_context,
                pre_classification_model=pre_classification_model,
                classification_model=classification_model,
                theme_model=theme_model,
            )
            classifier.classify()
            self._inject_categories(
                data=item, categories=self._extract_categories(doi, classifier)
            )
            i += 1
        except Exception as e:
            # Best-effort processing: one bad item must not abort the run.
            self.logger.error(f"Error processing item {i}: {e}")
            self.logger.error(f"Popping the item at index {i} from data")
            self.logger.error("Full error traceback:", exc_info=True)
            self.logger.error(f"Problem item: {item}")
            data.pop(i)
    return data
[docs]
def _inject_categories(
    self,
    data: Dict,
    categories: ClassificationResultsDict | ClassificationResultsTuple,
) -> None:
    """Write classification results onto a metadata dictionary.

    Sets ``data["categories"]`` to a fresh dict with the keys ``"top"``,
    ``"mid"`` and ``"low"``, and sets ``data["themes"]``.

    Args:
        data (Dict): Research metadata dictionary; modified in place.
        categories (ClassificationResultsDict | ClassificationResultsTuple):
            Either a dict read through the keys ``"top_categories"``,
            ``"mid_categories"``, ``"low_categories"`` and ``"themes"``
            (a missing key yields an empty list), or a tuple read by
            position as (top, mid, low, themes).

    Raises:
        ValueError: If ``categories`` is neither a dict nor a tuple;
            ``data`` is left untouched in that case.
    """
    # Reject unsupported shapes before touching `data`.
    if not isinstance(categories, (dict, tuple)):
        raise ValueError("Invalid categories format")

    levels = data["categories"] = {}
    if isinstance(categories, dict):
        # Mapping form: look the values up by name, defaulting to [].
        levels["top"] = categories.get("top_categories", [])
        levels["mid"] = categories.get("mid_categories", [])
        levels["low"] = categories.get("low_categories", [])
        data["themes"] = categories.get("themes", [])
    else:
        # Tuple form: (top, mid, low, themes) by position.
        levels["top"] = categories[0]
        levels["mid"] = categories[1]
        levels["low"] = categories[2]
        data["themes"] = categories[3]
[docs]
def _make_doi_abstract_dict(self, doi: str, abstract: str) -> Dict[str, str]:
    """Build a single-entry ``{doi: abstract}`` mapping.

    Args:
        doi (str): DOI identifier for the research item.
        abstract (str): Abstract text for that item.

    Returns:
        Dict[str, str]: ``{doi: abstract}``.

    Raises:
        ValueError: If ``doi`` or ``abstract`` is falsy (``None``, empty
            string, ...).
    """
    if doi and abstract:
        return {doi: abstract}
    raise ValueError("Both DOI and abstract must be provided and non-empty.")
[docs]
def _get_classification_dependencies(self, item: Dict) -> Tuple[str, str, dict]:
    """Pull the DOI, abstract and extra context out of a metadata item.

    Asks ``self.utilities.get_attributes`` for the CROSSREF_DOI,
    CROSSREF_ABSTRACT and CROSSREF_EXTRA_CONTEXT attributes. Each result
    entry is read as a pair whose first element says whether the value
    should be used and whose second element is the value.

    Args:
        item (Dict): Research metadata dictionary.

    Returns:
        Tuple[str, str, dict]: ``(doi, abstract, extra_context)``. Any
        element is ``None`` when the first element of its result entry
        is falsy.
    """
    wanted = (
        AttributeTypes.CROSSREF_DOI,
        AttributeTypes.CROSSREF_ABSTRACT,
        AttributeTypes.CROSSREF_EXTRA_CONTEXT,
    )
    lookup: Dict[AttributeTypes, Tuple[bool, str]] = self.utilities.get_attributes(
        item, list(wanted)
    )

    extracted = []
    for attribute in wanted:
        entry = lookup[attribute]
        # entry[0] is the "use this value" flag, entry[1] the value itself.
        extracted.append(entry[1] if entry[0] else None)

    doi, abstract, extra_context = extracted
    return doi, abstract, extra_context
[docs]
def _update_classified_instance_variables(
    self, item: Dict, doi: str | None, abstract: str | None
) -> None:
    """Record one item in every unclassified-item tracker.

    Args:
        item (Dict): Research metadata dictionary of the unclassified item.
        doi (str | None): DOI of the item, if one was found.
        abstract (str | None): Abstract of the item, if one was found.

    Returns:
        None

    Notes:
        - Increments ``unclassified_item_count``.
        - Appends to ``unclassified_dois``, ``unclassified_abstracts`` and
          ``unclassified_items``, and to the ``"dois"``, ``"abstracts"``
          and ``"items"`` lists of ``unclassified_details_dict``.
        - A falsy DOI or abstract is stored as the placeholder ``"NULL"``
          in every structure, including ``unclassified_doi_abstract_dict``
          (which previously received the raw ``None`` key/value, unlike
          the parallel lists).
        - ``unclassified_doi_abstract_dict`` is keyed by DOI, so items
          sharing a DOI (or all lacking one) overwrite each other there;
          the lists keep every entry.
    """
    placeholder = "NULL"
    doi_value: str = doi if doi else placeholder
    abstract_value: str = abstract if abstract else placeholder

    self.unclassified_item_count += 1

    self.unclassified_dois.append(doi_value)
    self.unclassified_abstracts.append(abstract_value)
    self.unclassified_doi_abstract_dict[doi_value] = abstract_value
    self.unclassified_items.append(item)

    details = self.unclassified_details_dict
    details["dois"].append(doi_value)
    details["abstracts"].append(abstract_value)
    details["items"].append(item)
[docs]
def _set_classification_ran_true(self) -> None:
    """Mark the orchestrator as having completed a classification run.

    Sets ``_classification_ran`` to ``True``; the ``get_unclassified_*``
    getters check this flag before returning data.

    Returns:
        None
    """
    ran: bool = True
    self._classification_ran = ran
[docs]
def _has_ran_classification(self) -> bool:
    """Report whether a classification run has completed.

    Returns:
        bool: The current value of ``_classification_ran``.
    """
    ran: bool = self._classification_ran
    return ran
[docs]
def _validate_classification_ran(self, classification_ran: bool) -> None:
    """Raise unless classification has already been run.

    Args:
        classification_ran (bool): Whether a classification run has
            completed (callers pass ``_has_ran_classification()``).

    Returns:
        None

    Raises:
        RuntimeError: If ``classification_ran`` is falsy.
    """
    if classification_ran:
        return

    message = (
        "Classification has not been run yet. "
        "Call run_classification() on your data before attempting to retrieve unclassified attributes. "
        "Data should be a list of loaded crossref JSON objects."
    )
    raise RuntimeError(message)
[docs]
def _normalize_abstract(self, abstract: str) -> str:
    """Strip LaTeX from an abstract and transliterate the result to ASCII.

    Two passes are applied in order:
        1. ``LatexNodes2Text(math_mode="text").latex_to_text`` converts
           LaTeX markup to plain (possibly unicode) text.
        2. ``unidecode`` transliterates that text to ASCII.

    Args:
        abstract (str): Abstract text, possibly containing LaTeX and
            unicode characters.

    Returns:
        str: The output of ``unidecode`` applied to the de-LaTeXed text.
    """
    # A fresh converter is built per call, as before.
    latex_free: str = LatexNodes2Text(math_mode="text").latex_to_text(abstract)
    return unidecode(latex_free)