# -*- coding: utf-8 -*-
"""
Confluence page fetching and processing utilities.
"""
import typing as T
import json
import gzip
from pathlib import Path
from functools import cached_property
from diskcache import Cache
from pydantic import BaseModel, Field, ConfigDict
import pyatlassian.api as pyatlassian
import atlas_doc_parser.api as atlas_doc_parser
from .constants import TAB, ConfluencePageFieldEnum
from .paths import dir_cache
[docs]
class ConfluencePage(BaseModel):
    """
    A data container for Confluence pages that enriches the API response data with
    hierarchical metadata and navigation properties.

    This class wraps the raw page data returned by Confluence's
    `get pages <https://developer.atlassian.com/cloud/confluence/rest/v2/api-group-page/#api-pages-get>`_ API
    and adds additional attributes for working with page hierarchies and navigation.

    :param page_data: The raw item response from the `Confluence.get_pages` API call
    :param site_url: Base URL of the Confluence site
    :param id_path: Hierarchical ID-based path (e.g., "/parent_id/child_id")
        for filtering with glob patterns
    :param position_path: Position-based path (e.g., "/1/3/2") used for hierarchical sorting
    :param breadcrumb_path: Human-readable title hierarchy (e.g., "|| Parent || Child || Page")
        similar to UI breadcrumbs

    The class assumes the body format is
    `Atlas Doc Format <https://developer.atlassian.com/cloud/jira/platform/apis/document/structure/>`_

    Properties like `id`, `title`, `parent_id` provide convenient access to commonly
    used attributes from the raw page data.
    """

    page_data: dict[str, T.Any] = Field()
    site_url: str = Field()
    # The three hierarchy fields are required but may be ``None`` until the
    # page has been placed in the page tree.
    id_path: T.Optional[str] = Field()
    position_path: T.Optional[str] = Field()
    breadcrumb_path: T.Optional[str] = Field()

    @property
    def space_id(self) -> str:
        """The ``spaceId`` value of the raw page data."""
        return self.page_data["spaceId"]

    @property
    def id(self) -> str:
        """The ``id`` value of the raw page data."""
        return self.page_data["id"]

    @property
    def parent_id(self) -> T.Optional[str]:
        """
        The ``parentId`` value of the raw page data.

        Hierarchy processing in this module treats ``None`` as "root page",
        hence the optional return type.
        """
        return self.page_data["parentId"]

    @property
    def parent_type(self) -> str:
        """The ``parentType`` value of the raw page data."""
        return self.page_data["parentType"]

    @property
    def title(self) -> str:
        """The ``title`` value of the raw page data."""
        return self.page_data["title"]

    @property
    def position(self) -> int:
        """The ``position`` value of the raw page data."""
        return self.page_data["position"]

    @property
    def atlas_doc(self) -> dict[str, T.Any]:
        """The page body, decoded from the JSON string stored under ``atlas_doc_format``."""
        return json.loads(self.page_data["body"]["atlas_doc_format"]["value"])

    @property
    def webui_url(self) -> str:
        """Browser URL of the page: ``{site_url}/wiki`` plus the ``webui`` link."""
        webui_link = self.page_data["_links"]["webui"]
        webui_url = f"{self.site_url}/wiki{webui_link}"
        return webui_url

    @property
    def markdown(self) -> str:
        """
        Markdown rendering of the page body, prefixed with the title as an H1 heading.

        Parsing and rendering are both invoked with ``ignore_error=True``.
        """
        node_doc = atlas_doc_parser.NodeDoc.from_dict(
            dct=self.atlas_doc,
            ignore_error=True,
        )
        md_content = node_doc.to_markdown(ignore_error=True)
        lines = [
            f"# {self.title}",
            "",
        ]
        lines.extend(md_content.splitlines())
        md_content = "\n".join(lines)
        return md_content

    def to_xml(
        self,
        wanted_fields: list[str] | None = None,
    ) -> str:
        """
        Serialize the Confluence page to an XML-like document string.

        The output is a ``<document>`` element that contains one child element
        per wanted field (source type, page URL, title, markdown content),
        suitable for document storage or AI context input. Values are inserted
        verbatim; no XML escaping is applied.

        :param wanted_fields: Field names (values of ``ConfluencePageFieldEnum``)
            to emit. ``None`` means all fields.

        :returns: The serialized document as a single string
        """
        if wanted_fields is None:
            wanted_fields = [field.value for field in ConfluencePageFieldEnum]
        lines = list()
        lines.append("<document>")
        if ConfluencePageFieldEnum.source_type.value in wanted_fields:
            field = ConfluencePageFieldEnum.source_type.value
            lines.append(f"{TAB}<{field}>Confluence Page</{field}>")
        if ConfluencePageFieldEnum.confluence_url.value in wanted_fields:
            field = ConfluencePageFieldEnum.confluence_url.value
            lines.append(f"{TAB}<{field}>{self.webui_url}</{field}>")
        if ConfluencePageFieldEnum.title.value in wanted_fields:
            field = ConfluencePageFieldEnum.title.value
            lines.append(f"{TAB}<{field}>{self.title}</{field}>")
        if ConfluencePageFieldEnum.markdown_content.value in wanted_fields:
            field = ConfluencePageFieldEnum.markdown_content.value
            lines.append(f"{TAB}<{field}>")
            lines.append(self.markdown)
            lines.append(f"{TAB}</{field}>")
        lines.append("</document>")
        return "\n".join(lines)

    def export_to_file(
        self,
        dir_out: Path,
        wanted_fields: list[str] | None = None,
    ) -> Path:
        """
        Write the XML serialization of this page into ``dir_out``.

        The file name is derived from ``breadcrumb_path``: the leading
        ``"|| "`` is dropped and every remaining ``"||"`` becomes ``"~"``.
        ``breadcrumb_path`` must therefore already be set.

        :param dir_out: Directory that receives the ``.xml`` file
        :param wanted_fields: Passed through to :meth:`to_xml`

        :returns: Path of the written file
        """
        fname = self.breadcrumb_path[3:].replace("||", "~")
        basename = f"{fname}.xml"
        path_out = dir_out.joinpath(basename)
        content = self.to_xml(wanted_fields=wanted_fields)
        try:
            path_out.write_text(content, encoding="utf-8")
        except FileNotFoundError:
            # The parent directory is missing. ``exist_ok=True`` keeps the
            # retry from failing if the directory appears in the meantime.
            path_out.parent.mkdir(parents=True, exist_ok=True)
            path_out.write_text(content, encoding="utf-8")
        return path_out
[docs]
def fetch_raw_pages_from_space(
    confluence: pyatlassian.confluence.Confluence,
    space_id: int,
) -> list[ConfluencePage]:
    """
    Collect every page of a Confluence space by walking the paginated API.

    Each item found under the ``results`` key of every paginator response is
    wrapped in a :class:`ConfluencePage`. No hierarchy processing happens here.

    :param confluence: Authenticated Confluence API client
    :param space_id: ID of the Confluence space to crawl

    :returns: List of :class:`ConfluencePage` objects with ``page_data`` and
        ``site_url`` populated and ``id_path``, ``position_path`` and
        ``breadcrumb_path`` all set to ``None``
    """
    responses = confluence.pagi_get_pages(
        space_id=[int(space_id)],
        body_format="atlas_doc_format",
    )
    return [
        ConfluencePage(
            page_data=item,
            site_url=confluence.url,
            id_path=None,
            position_path=None,
            breadcrumb_path=None,
        )
        for response in responses
        for item in response.get("results", [])
    ]
[docs]
def _position_sort_key(position: T.Any) -> tuple[int, T.Any]:
    """Return a sort key for one position: integers numerically, anything else after them."""
    if isinstance(position, int):
        return (0, position)
    return (1, str(position))


def enrich_pages_with_hierarchy_data(
    raw_pages: list[ConfluencePage],
) -> list[ConfluencePage]:
    """
    Enriches Confluence page objects with hierarchical relationship information.

    This function processes a list of raw ConfluencePage objects to:

    1. Create ID-based paths (id_path) representing the page hierarchy
    2. Generate position-based paths (position_path) for correct sorting
    3. Build human-readable title hierarchies (breadcrumb_path) for display

    The page tree is walked from the root pages (``parent_id is None``) down to
    their descendants, so hierarchies of any depth are supported. Pages that
    cannot be reached from a root are left untouched and omitted from the
    result. That covers pages whose parent is not part of ``raw_pages``
    (typically Confluence folders), the descendants of such pages, and pages
    caught in a parent cycle.

    The result is ordered by comparing the positions along each page's path
    numerically, so position 2 sorts before position 10 and a parent always
    precedes its descendants.

    :param raw_pages: List of :class:`ConfluencePage` objects with basic data but no hierarchy info

    :returns: List of :class:`ConfluencePage` objects enriched with hierarchy data and sorted by
        their position in the hierarchy
    """
    # Map page IDs to page objects; a later duplicate ID replaces an earlier one.
    id_to_page_mapping: dict[str, ConfluencePage] = {
        page.id: page for page in raw_pages
    }

    children_by_parent_id: dict[str, list[ConfluencePage]] = {}
    # Page ID -> tuple of per-level position keys; doubles as the "resolved" set.
    sort_keys: dict[str, tuple] = {}
    pending: list[ConfluencePage] = []

    for page in id_to_page_mapping.values():
        if page.parent_id is None:
            # Root page: its paths only contain the page itself.
            page.id_path = f"/{page.id}"
            page.position_path = f"/{page.position}"
            page.breadcrumb_path = f"|| {page.title}"
            sort_keys[page.id] = (_position_sort_key(page.position),)
            pending.append(page)
        else:
            children_by_parent_id.setdefault(page.parent_id, []).append(page)

    # Walk down from the resolved pages; each child derives its paths from its parent.
    while pending:
        parent_page = pending.pop()
        for page in children_by_parent_id.pop(parent_page.id, ()):
            page.id_path = f"{parent_page.id_path}/{page.id}"
            page.position_path = f"{parent_page.position_path}/{page.position}"
            page.breadcrumb_path = f"{parent_page.breadcrumb_path} || {page.title}"
            sort_keys[page.id] = sort_keys[parent_page.id] + (
                _position_sort_key(page.position),
            )
            pending.append(page)

    # Only resolved pages are returned; the sort is stable with respect to input order.
    return sorted(
        (page for page in id_to_page_mapping.values() if page.id in sort_keys),
        key=lambda page: sort_keys[page.id],
    )
[docs]
def load_or_build_page_hierarchy(
    confluence: pyatlassian.confluence.Confluence,
    space_id: int,
    cache: Cache,
    cache_key: str,
    expire: int = 24 * 60 * 60,
) -> list[ConfluencePage]:
    """
    Retrieves a complete Confluence page hierarchy with caching support.

    This function either:

    1. Returns a cached page hierarchy if available
    2. Or fetches pages, builds their hierarchy, and caches the result

    The function uses a composite cache key consisting of the Confluence URL,
    space ID, and provided cache key to ensure proper cache isolation.
    Results are compressed with gzip before caching to reduce storage usage.

    :param confluence: Authenticated Confluence API client
    :param space_id: ID of the Confluence space to crawl
    :param cache: Cache that stores the gzip-compressed JSON dump of the pages
    :param cache_key: Additional key component for cache differentiation
        (e.g., to cache different point-in-time snapshot of the same space)
    :param expire: Passed to ``cache.set`` as the ``expire`` argument
        when a freshly built hierarchy is stored (default: 24 hours)

    :returns: List of :class:`ConfluencePage` objects with complete hierarchy data,
        sorted by their hierarchical position
    """
    real_cache_key = (confluence.url, space_id, cache_key)
    # Single lookup instead of "key in cache" followed by "cache[key]": the
    # entry could expire between the two calls and raise KeyError. Stored
    # values are always bytes, so ``None`` unambiguously means "miss".
    cache_value = cache.get(real_cache_key)
    if cache_value is not None:  # pragma: no cover
        print("Hit cache!")  # for debug only
        data = json.loads(gzip.decompress(cache_value).decode("utf-8"))
        return [ConfluencePage(**page_data) for page_data in data]

    raw_pages = fetch_raw_pages_from_space(
        confluence=confluence,
        space_id=space_id,
    )
    sorted_pages = enrich_pages_with_hierarchy_data(raw_pages=raw_pages)
    data = [page.model_dump() for page in sorted_pages]
    cache_value = gzip.compress(json.dumps(data, ensure_ascii=False).encode("utf-8"))
    cache.set(real_cache_key, cache_value, expire=expire)
    return sorted_pages
[docs]
def process_include_exclude(
    include: list[str],
    exclude: list[str],
) -> tuple[list[str], list[str]]:
    """
    Normalize include and exclude patterns to page IDs.

    Every pattern is passed through ``extract_id``. A pattern that ends with
    the ``/*`` wildcard keeps that suffix on the resulting ID, so each output
    item is either ``<id>`` or ``<id>/*``.

    :param include: List of Confluence page URLs or IDs to include
        Items can be full URLs, page IDs, or patterns with /* suffix
    :param exclude: List of Confluence page URLs or IDs to exclude
        Items can be full URLs, page IDs, or patterns with /* suffix

    :return: A tuple of two lists:
        1. Normalized include patterns with extracted IDs
        2. Normalized exclude patterns with extracted IDs
    """

    def normalize(patterns: list[str]) -> list[str]:
        normalized = []
        for pattern in patterns:
            page_id = extract_id(pattern)
            normalized.append(page_id + "/*" if pattern.endswith("/*") else page_id)
        return normalized

    # Include patterns are processed before exclude patterns.
    return normalize(include), normalize(exclude)
[docs]
def is_matching(
    page_mapping: dict[str, ConfluencePage],
    page: ConfluencePage,
    include: T.List[str],
    exclude: T.List[str],
) -> bool:
    """
    Determine if a Confluence page matches the include/exclude filtering criteria.

    This function implements the filtering logic similar to gitignore patterns, where:

    - A page is included if it matches any include pattern
    - A page is excluded if it matches any exclude pattern
    - Patterns with /* suffix match the specified page and all its descendants
    - If no include patterns are provided, all pages are initially included (before exclusions)

    A wildcard pattern whose page ID is not present in ``page_mapping`` matches nothing.

    :param page_mapping: Dictionary mapping page IDs to their ConfluencePage objects
        for efficient parent-child relationship lookups
    :param page: The ConfluencePage object to check against the filters
    :param include: List of normalized page IDs or page ID patterns (with /* suffix)
        to include in results. This is a processed "include" list from process_include_exclude()
    :param exclude: List of normalized page IDs or page ID patterns (with /* suffix)
        to exclude from results. This is a processed "exclude" list from process_include_exclude()

    :return: True if the page should be included in the results, False otherwise
    """

    def matches(expr: str) -> bool:
        if expr.endswith("/*"):
            # Hierarchical pattern: the anchor page itself and everything below it.
            anchor_page = page_mapping.get(expr.removesuffix("/*"))
            if anchor_page is None:
                return False
            anchor_path = anchor_page.id_path
            # Compare on a path-segment boundary; a bare ``startswith`` would
            # also accept "/1/123" as a descendant of "/1/12".
            return page.id_path == anchor_path or page.id_path.startswith(
                anchor_path + "/"
            )
        # Direct page ID match. Stray trailing "/" or "*" characters are
        # tolerated, as they always have been.
        return page.id == expr.rstrip("/*")

    # With include patterns present, the page has to match at least one of them.
    if include and not any(matches(expr) for expr in include):
        return False
    # A page matching any exclude pattern is filtered out.
    return not any(matches(expr) for expr in exclude)
[docs]
def find_matching_pages(
    sorted_pages: list[ConfluencePage],
    include: T.List[str],
    exclude: T.List[str],
):
    """
    Select the pages that pass gitignore-like include/exclude filtering.

    Patterns may be page URLs or page IDs, optionally followed by ``/*`` to
    address a page together with all of its descendants (like a folder).

    The filtering works as follows:

    1. All patterns are normalized with ``process_include_exclude``
    2. A page is a candidate when it matches any include pattern
       (every page is a candidate when there are no include patterns)
    3. A candidate that matches any exclude pattern is dropped
    4. The relative order of ``sorted_pages`` is preserved

    :param sorted_pages: List of :class:`ConfluencePage` objects sorted by hierarchy
        (typically from `enrich_pages_with_hierarchy_data`)
    :param include: List of Confluence page URLs or IDs to include
        Can be full URLs, page IDs, or patterns with /* suffix
    :param exclude: List of Confluence page URLs or IDs to exclude
        Can be full URLs, page IDs, or patterns with /* suffix

    :return: Filtered list of :class:`ConfluencePage` objects that match the criteria
    """
    pages_by_id = {page.id: page for page in sorted_pages}
    include_patterns, exclude_patterns = process_include_exclude(include, exclude)
    return [
        candidate
        for candidate in sorted_pages
        if is_matching(
            page_mapping=pages_by_id,
            page=candidate,
            include=include_patterns,
            exclude=exclude_patterns,
        )
    ]
[docs]
class ConfluencePipeline(BaseModel):
    """
    A data pipeline that extracts and synchronizes Confluence pages to a target location.

    ConfluencePipeline provides an abstraction for defining a Confluence space source and
    filtering criteria, then exporting the matching pages to a specified output directory
    as structured XML documents that preserve both content and metadata.

    The pipeline handles the complete workflow from authentication to content extraction,
    hierarchical processing, filtering, and file export with metadata preservation.

    Example:

    .. code-block:: python

        confluence_pipeline = ConfluencePipeline(
            confluence=confluence,
            space_id=space_id,
            # Use cache key to avoid re-fetching the same page hierarchy
            # it will store all pages in the cache and use it for filtering
            # if you change the include / exclude pattern
            cache_key=cache_key,
            include=[
                # include all child page
                f"{confluence.url}/wiki/spaces/{space_key}/pages/{page_id}/{page_title}/*",
                # only include this page, no child page
                f"{confluence.url}/wiki/spaces/{space_key}/pages/{page_id}/{page_title}",
            ],
            exclude=[
                # exclude all child page
                f"{confluence.url}/wiki/spaces/{space_key}/pages/{page_id}/{page_title}/*",
                # only exclude this page, no child page
                f"{confluence.url}/wiki/spaces/{space_key}/pages/{page_id}/{page_title}",
            ],
        )

    :param confluence: Authenticated Confluence API client instance
    :param space_id: space ID (int) or space key (str) of the Confluence space to process
    :param include: List of patterns (URLs or IDs) specifying which pages to include.
        Use Page URL + ``/*`` to include all children of a page.
    :param exclude: List of patterns (URLs or IDs) specifying which pages to exclude.
        Use Page URL + ``/*`` to exclude all children of a page.
    :param dir_out: The directory where the XML files should be exported
    :param cache_key: Key for caching and retrieving page hierarchies
    :param cache_expire: Cache expiration time in seconds (default: 24 hours)
    :param cache_path: Directory of the on-disk cache (default: ``dir_cache``)
    :param wanted_fields: Field names to emit in each exported document;
        ``None`` exports all fields
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
    )

    confluence: "pyatlassian.confluence.Confluence" = Field()
    space_id: int | str = Field()
    include: list[str] = Field()
    exclude: list[str] = Field()
    dir_out: Path = Field()
    cache_key: str = Field()
    cache_expire: int = Field(default=24 * 60 * 60)
    cache_path: str = Field(default=str(dir_cache))
    wanted_fields: list[str] | None = Field(default=None)

    @cached_property
    def _space_id(self) -> int:
        """
        Get the numeric space ID from the provided ``space_id``.

        An integer ``space_id`` is returned as is. A string is treated as a
        space key and looked up through the ``get_spaces`` API.

        :raises ValueError: if no space with the given key is returned by the API
        """
        if not isinstance(self.space_id, str):
            return self.space_id
        res = self.confluence.get_spaces(
            keys=[self.space_id],
        )
        for dct in res.get("results", []):
            if dct.get("key") == self.space_id:
                return int(dct["id"])
        raise ValueError("Space not found")  # pragma: no cover

    @cached_property
    def cache(self) -> Cache:
        """The on-disk cache located at ``cache_path``, created on first access."""
        return Cache(self.cache_path)

    def post_process_confluence_page(
        self,
        confluence_page: ConfluencePage,
    ) -> ConfluencePage:
        """
        Post-process the ConfluencePage object after fetching it.

        User can override this method to add custom processing logic.
        The default implementation returns the page unchanged.
        """
        return confluence_page

    def post_process_path_out(
        self,
        confluence_page: ConfluencePage,
        path_out: Path,
    ):
        """
        Post-process the output path after exporting a Confluence page.

        The default implementation does nothing; override it to add custom logic.
        """
        pass

    def fetch(self):
        """
        Execute the pipeline to extract and export Confluence pages to the target directory.

        This method performs the complete workflow:

        1. Load the page hierarchy of the space (from the cache when available)
        2. Keep the pages that match the include/exclude patterns
        3. Run :meth:`post_process_confluence_page` on each page, export it as an
           XML document to ``dir_out``, then call :meth:`post_process_path_out`
        """
        sorted_pages = load_or_build_page_hierarchy(
            confluence=self.confluence,
            space_id=self._space_id,
            cache_key=self.cache_key,
            cache=self.cache,
            # Honor the configured expiry; without it the function default is
            # used and the ``cache_expire`` field has no effect.
            expire=self.cache_expire,
        )
        matched_pages = find_matching_pages(
            sorted_pages=sorted_pages,
            include=self.include,
            exclude=self.exclude,
        )
        for page in matched_pages:
            page = self.post_process_confluence_page(page)
            path_out = page.export_to_file(
                dir_out=self.dir_out,
                wanted_fields=self.wanted_fields,
            )
            self.post_process_path_out(confluence_page=page, path_out=path_out)