Source code for docpack.confluence_fetcher

# -*- coding: utf-8 -*-

"""
Confluence page fetching and processing utilities.
"""

import typing as T
import json
import gzip
from pathlib import Path
from functools import cached_property

from diskcache import Cache
from pydantic import BaseModel, Field, ConfigDict
import pyatlassian.api as pyatlassian
import atlas_doc_parser.api as atlas_doc_parser

from .constants import TAB, ConfluencePageFieldEnum
from .paths import dir_cache


class ConfluencePage(BaseModel):
    """
    A data container for Confluence pages that enriches the API response data
    with hierarchical metadata and navigation properties.

    This class wraps the raw page data returned by Confluence's
    `get pages <https://developer.atlassian.com/cloud/confluence/rest/v2/api-group-page/#api-pages-get>`_
    API and adds additional attributes for working with page hierarchies
    and navigation.

    :param page_data: The raw item response from the `Confluence.get_pages` API call
    :param site_url: Base URL of the Confluence site
    :param id_path: Hierarchical ID-based path (e.g., "/parent_id/child_id")
        for filtering with glob patterns
    :param position_path: Position-based path (e.g., "/1/3/2") used for
        hierarchical sorting
    :param breadcrumb_path: Human-readable title hierarchy
        (e.g., "|| Parent || Child || Page") similar to UI breadcrumbs

    The class assumes the body format is
    `Atlas Doc Format <https://developer.atlassian.com/cloud/jira/platform/apis/document/structure/>`_

    Properties like `id`, `title`, `parent_id` provide convenient access to
    commonly used attributes from the raw page data.
    """

    page_data: dict[str, T.Any] = Field()
    site_url: str = Field()
    # The three hierarchy fields are required but may be None until the page
    # has been placed in the page tree.
    id_path: T.Optional[str] = Field()
    position_path: T.Optional[str] = Field()
    breadcrumb_path: T.Optional[str] = Field()

    @property
    def space_id(self) -> str:
        """Value of ``spaceId`` in the raw page data."""
        return self.page_data["spaceId"]

    @property
    def id(self) -> str:
        """Value of ``id`` in the raw page data."""
        return self.page_data["id"]

    @property
    def parent_id(self) -> str:
        """Value of ``parentId`` in the raw page data."""
        return self.page_data["parentId"]

    @property
    def parent_type(self) -> str:
        """Value of ``parentType`` in the raw page data."""
        return self.page_data["parentType"]

    @property
    def title(self) -> str:
        """Value of ``title`` in the raw page data."""
        return self.page_data["title"]

    @property
    def position(self) -> int:
        """Value of ``position`` in the raw page data."""
        return self.page_data["position"]

    @property
    def atlas_doc(self) -> dict[str, T.Any]:
        """The page body, decoded from the JSON string stored under
        ``body.atlas_doc_format.value``."""
        return json.loads(self.page_data["body"]["atlas_doc_format"]["value"])

    @property
    def webui_url(self) -> str:
        """Browser URL of the page: ``{site_url}/wiki`` plus the ``_links.webui`` path."""
        webui_link = self.page_data["_links"]["webui"]
        return f"{self.site_url}/wiki{webui_link}"

    @property
    def markdown(self) -> str:
        """
        Markdown rendering of the page body, prefixed with the page title
        as a level-1 heading followed by a blank line.
        """
        node_doc = atlas_doc_parser.NodeDoc.from_dict(
            dct=self.atlas_doc,
            ignore_error=True,
        )
        body = node_doc.to_markdown(ignore_error=True)
        lines = [f"# {self.title}", ""]
        lines.extend(body.splitlines())
        return "\n".join(lines)

    def to_xml(
        self,
        wanted_fields: list[str] | None = None,
    ) -> str:
        """
        Serialize the page to an XML-like ``<document>`` string.

        Each field is emitted only when its name is listed in ``wanted_fields``.
        Values are inserted verbatim; they are not XML-escaped.

        :param wanted_fields: Field names (values of
            :class:`ConfluencePageFieldEnum`) to emit. ``None`` emits every
            field of the enum.

        :returns: The document as a newline-joined string
        """
        if wanted_fields is None:
            wanted_fields = [field.value for field in ConfluencePageFieldEnum]

        lines = ["<document>"]
        if ConfluencePageFieldEnum.source_type.value in wanted_fields:
            field = ConfluencePageFieldEnum.source_type.value
            lines.append(f"{TAB}<{field}>Confluence Page</{field}>")
        if ConfluencePageFieldEnum.confluence_url.value in wanted_fields:
            field = ConfluencePageFieldEnum.confluence_url.value
            lines.append(f"{TAB}<{field}>{self.webui_url}</{field}>")
        if ConfluencePageFieldEnum.title.value in wanted_fields:
            field = ConfluencePageFieldEnum.title.value
            lines.append(f"{TAB}<{field}>{self.title}</{field}>")
        if ConfluencePageFieldEnum.markdown_content.value in wanted_fields:
            field = ConfluencePageFieldEnum.markdown_content.value
            lines.append(f"{TAB}<{field}>")
            lines.append(self.markdown)
            lines.append(f"{TAB}</{field}>")
        lines.append("</document>")
        return "\n".join(lines)

    def export_to_file(
        self,
        dir_out: Path,
        wanted_fields: list[str] | None = None,
    ) -> Path:
        """
        Write the XML form of the page to ``dir_out`` and return the file path.

        The file name is derived from ``breadcrumb_path``: the first three
        characters (the leading ``"|| "``) are dropped and every remaining
        ``"||"`` becomes ``"~"``, then ``.xml`` is appended. Missing parent
        directories are created.

        :param dir_out: Directory that receives the file
        :param wanted_fields: Forwarded to :meth:`to_xml`

        :returns: Path of the written file
        """
        # NOTE(review): a "/" inside a title ends up in the file name and so
        # yields nested directories under dir_out - confirm this is intended.
        fname = self.breadcrumb_path[3:].replace("||", "~")
        path_out = dir_out.joinpath(f"{fname}.xml")
        # Render before touching the filesystem so a rendering error leaves
        # no empty directories behind.
        content = self.to_xml(wanted_fields=wanted_fields)
        # exist_ok=True makes this safe when the directory already exists,
        # instead of retrying the write after a FileNotFoundError.
        path_out.parent.mkdir(parents=True, exist_ok=True)
        path_out.write_text(content, encoding="utf-8")
        return path_out
def fetch_raw_pages_from_space(
    confluence: pyatlassian.confluence.Confluence,
    space_id: int,
) -> list[ConfluencePage]:
    """
    Collect every page of a Confluence space by walking the paginated API.

    Each raw item found under the ``results`` key of a paginator response is
    wrapped in a :class:`ConfluencePage`. No hierarchy processing happens here.

    :param confluence: Authenticated Confluence API client
    :param space_id: ID of the Confluence space to crawl

    :returns: List of :class:`ConfluencePage` objects with ``page_data`` and
        ``site_url`` set, and ``id_path``, ``position_path`` and
        ``breadcrumb_path`` left as ``None``
    """
    responses = confluence.pagi_get_pages(
        space_id=[int(space_id)],
        body_format="atlas_doc_format",
    )
    # Flatten "responses -> results" into one list of page objects.
    return [
        ConfluencePage(
            page_data=raw_item,
            site_url=confluence.url,
            id_path=None,
            position_path=None,
            breadcrumb_path=None,
        )
        for response in responses
        for raw_item in response.get("results", [])
    ]
def _position_sort_component(page: ConfluencePage) -> tuple:
    """Build one level of a hierarchical sort key for ``page``."""
    position = page.position
    # A missing position sorts after every numeric one and is never compared
    # with an int; the page id keeps siblings that share a position apart so
    # their subtrees stay contiguous.
    if position is None:
        return (1, 0, page.id)
    return (0, position, page.id)


def enrich_pages_with_hierarchy_data(
    raw_pages: list[ConfluencePage],
) -> list[ConfluencePage]:
    """
    Enriches Confluence page objects with hierarchical relationship information.

    For every page reachable from a root page (a page whose ``parent_id`` is
    ``None``) this function sets, in place:

    1. ``id_path``: the chain of page IDs, e.g. ``"/root_id/child_id"``
    2. ``position_path``: the chain of positions, e.g. ``"/1/3/2"``
    3. ``breadcrumb_path``: the chain of titles, e.g. ``"|| Parent || Child"``

    Pages whose parent is not part of ``raw_pages`` (typically pages living
    under a Confluence folder), all descendants of such pages, and pages
    caught in a circular parent reference are left out of the result.

    The tree is walked from parents to children, so depth is not limited and
    the order of ``raw_pages`` does not matter.

    :param raw_pages: List of :class:`ConfluencePage` objects with basic data
        but no hierarchy info

    :returns: The enriched pages, sorted so that each parent precedes its
        children and siblings are ordered by numeric position
    """
    # Later duplicates of an ID replace earlier ones.
    pages_by_id = {page.id: page for page in raw_pages}

    roots = []
    children_by_parent_id = {}
    for page in pages_by_id.values():
        parent_id = page.parent_id
        if parent_id is None:
            roots.append(page)
        elif parent_id in pages_by_id:
            children_by_parent_id.setdefault(parent_id, []).append(page)
        # Otherwise the parent is outside our scope: the page is never linked
        # to a root, so it and its descendants are dropped below.

    # Numeric sort keys are kept next to the string position_path because
    # comparing the strings orders "/10" before "/2".
    sort_keys = {}
    for root in roots:
        root.id_path = f"/{root.id}"
        root.position_path = f"/{root.position}"
        root.breadcrumb_path = f"|| {root.title}"
        sort_keys[root.id] = (_position_sort_component(root),)

    # Each page has exactly one parent and roots have none, so every page is
    # visited at most once and circular references are never entered.
    pending = list(roots)
    while pending:
        parent = pending.pop()
        for child in children_by_parent_id.get(parent.id, ()):
            child.id_path = f"{parent.id_path}/{child.id}"
            child.position_path = f"{parent.position_path}/{child.position}"
            child.breadcrumb_path = f"{parent.breadcrumb_path} || {child.title}"
            sort_keys[child.id] = sort_keys[parent.id] + (
                _position_sort_component(child),
            )
            pending.append(child)

    resolved_pages = [page for page in pages_by_id.values() if page.id in sort_keys]
    resolved_pages.sort(key=lambda page: sort_keys[page.id])
    return resolved_pages
def load_or_build_page_hierarchy(
    confluence: pyatlassian.confluence.Confluence,
    space_id: int,
    cache: Cache,
    cache_key: str,
    expire: int = 24 * 60 * 60,
) -> list[ConfluencePage]:
    """
    Retrieves a complete Confluence page hierarchy with caching support.

    This function either:

    1. Returns a cached page hierarchy if available
    2. Or fetches pages, builds their hierarchy, and caches the result

    The function uses a composite cache key consisting of the Confluence URL,
    space ID, and provided cache key to ensure proper cache isolation.
    Results are stored as gzip-compressed JSON.

    :param confluence: Authenticated Confluence API client
    :param space_id: ID of the Confluence space to crawl
    :param cache: Cache in which the hierarchy is looked up and stored
    :param cache_key: Additional key component for cache differentiation
        (e.g., to cache different point-in-time snapshot of the same space)
    :param expire: Value passed as ``expire`` to ``cache.set`` when storing a
        freshly built hierarchy (default: 24 * 60 * 60)

    :returns: List of :class:`ConfluencePage` objects with complete hierarchy
        data, sorted by their hierarchical position
    """
    real_cache_key = (confluence.url, space_id, cache_key)

    # One lookup instead of "key in cache" followed by "cache[key]": the entry
    # may expire between the two calls, which would raise KeyError.
    # Stored values are always bytes, so None reliably means "miss".
    cached_value = cache.get(real_cache_key)
    if cached_value is not None:  # pragma: no cover
        print("Hit cache!")  # for debug only
        data = json.loads(gzip.decompress(cached_value).decode("utf-8"))
        return [ConfluencePage(**page_data) for page_data in data]

    raw_pages = fetch_raw_pages_from_space(
        confluence=confluence,
        space_id=space_id,
    )
    sorted_pages = enrich_pages_with_hierarchy_data(raw_pages=raw_pages)
    data = [page.model_dump() for page in sorted_pages]
    cache_value = gzip.compress(json.dumps(data, ensure_ascii=False).encode("utf-8"))
    cache.set(real_cache_key, cache_value, expire=expire)
    return sorted_pages
def extract_id(url_or_id: str) -> str:
    """
    Extract the page ID from a Confluence URL or return the ID if directly provided.

    Accepted inputs:

    - a bare page ID, e.g. ``"123456"``
    - a bare page ID with the wildcard suffix, e.g. ``"123456/*"``
    - a page URL containing a ``/pages/<id>`` segment, optionally followed by
      a title, a trailing ``/*``, a query string or a fragment

    :param url_or_id: A Confluence page URL or direct page ID.
        Example: "https://example.atlassian.net/wiki/spaces/BD/pages/123456/Value+Proposition"
        or just "123456"

    :return: The extracted page ID as a string

    :raises ValueError: If the input contains a slash but does not contain
        exactly one ``/pages/`` segment
    """
    wildcard = "/*"

    # A bare ID, possibly carrying the wildcard suffix.
    if "/" not in url_or_id or (
        url_or_id.count("/") == 1 and url_or_id.endswith(wildcard)
    ):
        # removesuffix drops exactly "/*"; rstrip("/*") would strip any run of
        # trailing "/" and "*" characters.
        return url_or_id.removesuffix(wildcard)

    # It's a URL: the ID is the path segment right after /pages/.
    parts = url_or_id.split("/pages/")
    if len(parts) != 2:
        raise ValueError(f"Invalid Confluence URL format: {url_or_id}")

    page_id = parts[1].split("/", 1)[0]
    # When the URL has no title segment, a query string or fragment directly
    # follows the ID and must not become part of it.
    for separator in ("?", "#"):
        page_id = page_id.partition(separator)[0]
    return page_id
def process_include_exclude(
    include: list[str],
    exclude: list[str],
) -> tuple[list[str], list[str]]:
    """
    Normalize include and exclude patterns to page IDs.

    Every pattern is reduced to its page ID with :func:`extract_id`. A pattern
    that ends with ``/*`` keeps that suffix, so each result item is either
    ``"<id>"`` or ``"<id>/*"``.

    :param include: List of Confluence page URLs or IDs to include.
        Items can be full URLs, page IDs, or patterns with /* suffix
    :param exclude: List of Confluence page URLs or IDs to exclude.
        Items can be full URLs, page IDs, or patterns with /* suffix

    :return: A tuple of two lists:
        1. Normalized include patterns
        2. Normalized exclude patterns
    """

    def normalize(pattern: str) -> str:
        page_id = extract_id(pattern)
        return page_id + "/*" if pattern.endswith("/*") else page_id

    normalized_include = [normalize(pattern) for pattern in include]
    normalized_exclude = [normalize(pattern) for pattern in exclude]
    return normalized_include, normalized_exclude
def _matches_pattern(
    page_mapping: dict[str, ConfluencePage],
    page: ConfluencePage,
    expr: str,
) -> bool:
    """Tell whether ``page`` is selected by one normalized pattern."""
    if not expr.endswith("/*"):
        # Plain pattern: matches the page with exactly this ID.
        return page.id == expr

    # Hierarchical pattern: the named page and all of its descendants.
    ancestor = page_mapping.get(expr.removesuffix("/*"))
    if ancestor is None:
        # The pattern names a page we do not know about.
        return False
    ancestor_path = ancestor.id_path
    page_path = page.id_path
    if ancestor_path is None or page_path is None:
        # Without hierarchy paths no ancestry can be established.
        return False
    # Compare whole path segments: a bare prefix test would let "/1/12" claim
    # the unrelated page "/1/123".
    return page_path == ancestor_path or page_path.startswith(f"{ancestor_path}/")


def is_matching(
    page_mapping: dict[str, ConfluencePage],
    page: ConfluencePage,
    include: T.List[str],
    exclude: T.List[str],
) -> bool:
    """
    Determine if a Confluence page matches the include/exclude filtering criteria.

    The filtering logic is similar to gitignore patterns:

    - If include patterns are given, the page must match at least one of them
    - If no include patterns are given, every page is initially included
    - A page matching any exclude pattern is rejected
    - A pattern with /* suffix matches the specified page and all its
      descendants; a pattern without it matches only that page
    - A /* pattern whose page ID is not in ``page_mapping`` matches nothing

    :param page_mapping: Dictionary mapping page IDs to their ConfluencePage
        objects, used to look up the page named by a /* pattern
    :param page: The ConfluencePage object to check against the filters
    :param include: List of normalized page IDs or page ID patterns
        (with /* suffix) to include in results. This is a processed "include"
        list from process_include_exclude()
    :param exclude: List of normalized page IDs or page ID patterns
        (with /* suffix) to exclude from results. This is a processed
        "exclude" list from process_include_exclude()

    :return: True if the page should be included in the results, False otherwise
    """
    if include and not any(
        _matches_pattern(page_mapping, page, expr) for expr in include
    ):
        return False
    return not any(_matches_pattern(page_mapping, page, expr) for expr in exclude)
def find_matching_pages(
    sorted_pages: list[ConfluencePage],
    include: T.List[str],
    exclude: T.List[str],
):
    """
    Filter Confluence pages based on include/exclude patterns similar to gitignore.

    Patterns may be page URLs or page IDs; a ``/*`` suffix selects a page
    together with all of its descendants. The patterns are normalized with
    :func:`process_include_exclude` and each page is then tested with
    :func:`is_matching`.

    :param sorted_pages: List of :class:`ConfluencePage` objects sorted by
        hierarchy (typically from `enrich_pages_with_hierarchy_data`)
    :param include: List of Confluence page URLs or IDs to include.
        Can be full URLs, page IDs, or patterns with /* suffix
    :param exclude: List of Confluence page URLs or IDs to exclude.
        Can be full URLs, page IDs, or patterns with /* suffix

    :return: The pages of ``sorted_pages`` that pass the filter, in their
        original order
    """
    pages_by_id = {page.id: page for page in sorted_pages}
    normalized_include, normalized_exclude = process_include_exclude(include, exclude)
    return [
        candidate
        for candidate in sorted_pages
        if is_matching(
            page_mapping=pages_by_id,
            page=candidate,
            include=normalized_include,
            exclude=normalized_exclude,
        )
    ]
class ConfluencePipeline(BaseModel):
    """
    A data pipeline that extracts and synchronizes Confluence pages to a target location.

    ConfluencePipeline provides an abstraction for defining a Confluence space
    source and filtering criteria, then exporting the matching pages to a
    specified output directory as structured XML documents that preserve both
    content and metadata.

    Example:

    .. code-block:: python

        confluence_pipeline = ConfluencePipeline(
            confluence=confluence,
            space_id=space_id,
            # Use cache key to avoid re-fetching the same page hierarchy
            # it will store all pages in the cache and use it for filtering
            # if you change the include / exclude pattern
            cache_key=cache_key,
            include=[
                # include all child page
                f"{confluence.url}/wiki/spaces/{space_key}/pages/{page_id}/{page_title}/*",
                # only include this page, no child page
                f"{confluence.url}/wiki/spaces/{space_key}/pages/{page_id}/{page_title}",
            ],
            exclude=[
                # exclude all child page
                f"{confluence.url}/wiki/spaces/{space_key}/pages/{page_id}/{page_title}/*",
                # only exclude this page, no child page
                f"{confluence.url}/wiki/spaces/{space_key}/pages/{page_id}/{page_title}",
            ],
        )

    :param confluence: Authenticated Confluence API client instance
    :param space_id: space ID (int) or space key (str) of the Confluence space to process
    :param include: List of patterns (URLs or IDs) specifying which pages to include.
        Use Page URL + ``/*`` to include all children of a page.
    :param exclude: List of patterns (URLs or IDs) specifying which pages to exclude.
        Use Page URL + ``/*`` to exclude all children of a page.
    :param dir_out: The directory where the XML files should be exported
    :param cache_key: Key for caching and retrieving page hierarchies
    :param cache_expire: Cache expiration time in seconds (default: 24 hours)
    :param cache_path: Directory of the on-disk cache (default: ``dir_cache``)
    :param wanted_fields: Field names forwarded to ``ConfluencePage.export_to_file``;
        ``None`` is forwarded unchanged
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
    )

    confluence: "pyatlassian.confluence.Confluence" = Field()
    space_id: int | str = Field()
    include: list[str] = Field()
    exclude: list[str] = Field()
    dir_out: Path = Field()
    cache_key: str = Field()
    cache_expire: int = Field(default=24 * 60 * 60)
    cache_path: str = Field(default=str(dir_cache))
    wanted_fields: list[str] | None = Field(default=None)

    @cached_property
    def _space_id(self) -> int:
        """
        Get the space ID from the provided space_id.

        A non-string ``space_id`` is returned as is. A string is treated as a
        space key and resolved through ``confluence.get_spaces``.

        :raises ValueError: If no returned space has the requested key
        """
        if not isinstance(self.space_id, str):
            return self.space_id

        res = self.confluence.get_spaces(
            keys=[self.space_id],
        )
        for dct in res.get("results", []):
            if dct.get("key") == self.space_id:
                return int(dct["id"])
        raise ValueError("Space not found")  # pragma: no cover

    @cached_property
    def cache(self) -> Cache:
        """Cache opened at ``cache_path``; created on first access."""
        return Cache(self.cache_path)

    def post_process_confluence_page(
        self,
        confluence_page: ConfluencePage,
    ) -> ConfluencePage:
        """
        Post-process the ConfluencePage object after fetching it.

        Called by :meth:`fetch` for every matched page before it is exported.
        The default implementation returns the page unchanged; override this
        method to add custom processing logic.
        """
        return confluence_page

    def post_process_path_out(
        self,
        confluence_page: ConfluencePage,
        path_out: Path,
    ):
        """
        Post-process the output path after exporting a Confluence page.

        Called by :meth:`fetch` after each page has been written. The default
        implementation does nothing.
        """
        pass

    def fetch(self):
        """
        Execute the pipeline to extract and export Confluence pages to the target directory.

        This method performs the complete workflow:

        1. Load the page hierarchy of the space (from cache when available)
        2. Keep the pages that match the include/exclude patterns
        3. Run :meth:`post_process_confluence_page`, export the page as an XML
           document to ``dir_out``, then run :meth:`post_process_path_out`
        """
        sorted_pages = load_or_build_page_hierarchy(
            confluence=self.confluence,
            space_id=self._space_id,
            cache_key=self.cache_key,
            cache=self.cache,
            # Honor the configured expiration; without this the cache_expire
            # field is ignored and the function default always applies.
            expire=self.cache_expire,
        )
        matched_pages = find_matching_pages(
            sorted_pages=sorted_pages,
            include=self.include,
            exclude=self.exclude,
        )
        for page in matched_pages:
            page = self.post_process_confluence_page(page)
            path_out = page.export_to_file(
                dir_out=self.dir_out,
                wanted_fields=self.wanted_fields,
            )
            self.post_process_path_out(confluence_page=page, path_out=path_out)