Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: Block-level copying of Jinja content between split projects #192

Merged
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions dbt_meshify/change.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ class Operation(str, Enum):
"""An operation describes the type of work being performed."""

Add = "add"
Append = "append"
Update = "update"
Remove = "remove"
Copy = "copy"
Expand All @@ -17,6 +18,7 @@ class Operation(str, Enum):

prepositions = {
Operation.Add: "to",
Operation.Append: "to",
Operation.Move: "to",
Operation.Copy: "to",
Operation.Update: "in",
Expand Down
60 changes: 60 additions & 0 deletions dbt_meshify/dbt_projects.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

from dbt_meshify.dbt import Dbt
from dbt_meshify.exceptions import FatalMeshifyException
from dbt_meshify.storage.jinja_blocks import JinjaBlock, find_doc_reference


class BaseDbtProject:
Expand Down Expand Up @@ -310,6 +311,30 @@ def __init__(
resources = self.select_resources(output_key="unique_id")

super().__init__(manifest, project, catalog, name, resources)
self.jinja_blocks: Dict[str, JinjaBlock] = self.find_jinja_blocks()

def find_jinja_blocks(self) -> Dict[str, JinjaBlock]:
    """Collect the docs and macro Jinja blocks owned by this project, keyed by unique_id.

    Manifest entries whose ``package_name`` differs from this project's name are skipped.
    """

    # Docs are visited before macros so the resulting dict keeps the same insertion order.
    sources = (
        (self.manifest.docs, "docs"),
        (self.manifest.macros, "macro"),
    )

    found = {}
    for collection, kind in sources:
        for unique_id, entry in collection.items():
            if entry.package_name == self.name:
                found[unique_id] = JinjaBlock.from_file(
                    path=self.path / entry.original_file_path,
                    block_type=kind,
                    name=entry.name,
                )

    return found

def select_resources(
self,
Expand Down Expand Up @@ -396,6 +421,8 @@ def __init__(
self.groups = self._get_indirect_groups()
self._rename_project()

self._referenced_docs: Optional[Set[str]] = None

def _rename_project(self) -> None:
"""
edits the project yml to take any instance of the parent project name and update it to the subproject name
Expand Down Expand Up @@ -432,6 +459,39 @@ def _get_custom_macros(self) -> Set[str]:
macros_set.update(self._get_macro_dependencies(macro))
return macros_set

@property
def referenced_docs(self) -> Set[str]:
"""Return a list of all docs referenced within this SubProject."""

if self._referenced_docs:
return self._referenced_docs

docs = set()
for unique_id in self.resources:
if unique_id.startswith("test."):
continue

node = self.get_manifest_node(unique_id)

if node is None:
nicholasyager marked this conversation as resolved.
Show resolved Hide resolved
continue

if hasattr(node, "raw_code"):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

forgetting which nodes have raw_code -- does this indicate we'd be searching the content of a model for docs blocks? are we looking for docs in the definition of YML only resources in this step?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you've figured this out already, but yes. This checks raw_code, and the next check on line 482 checks within the YAML file, too.

docs.update(find_doc_reference(node.raw_code))

if hasattr(node, "patch_path"):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah i think this answers my above question

path = self.parent_project.resolve_patch_path(node)
if path.exists():
with open(path) as file:
docs.update(find_doc_reference(file.read()))

# Use the search name for the doc to resolve a unique_id for the doc resource.
self._referenced_docs = {
unique_id for unique_id, doc in self.manifest.docs.items() if doc.name in docs
}

return self._referenced_docs

def _get_indirect_groups(self) -> Set[str]:
"""
get a set of group unique_ids for all the selected resources
Expand Down
3 changes: 2 additions & 1 deletion dbt_meshify/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,8 @@ def split(
change_set = subproject_creator.initialize()

return [change_set]
except Exception:
except Exception as e:
logger.exception(e) # TODO: Remove this line!
raise FatalMeshifyException(f"Error creating subproject {subproject.name}")


Expand Down
33 changes: 30 additions & 3 deletions dbt_meshify/storage/dbt_project_editors.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from dbt_meshify.dbt_projects import DbtSubProject
from dbt_meshify.storage.file_content_editors import NamedList, filter_empty_dict_items
from dbt_meshify.storage.file_manager import YAMLFileManager, yaml
from dbt_meshify.storage.jinja_blocks import JinjaBlock
from dbt_meshify.utilities.contractor import Contractor
from dbt_meshify.utilities.dependencies import DependenciesUpdater
from dbt_meshify.utilities.grouper import ResourceGrouper
Expand Down Expand Up @@ -141,7 +142,12 @@ def initialize(self) -> ChangeSet:
f"Identifying operations required to split {subproject.name} from {subproject.parent_project.name}."
)

for unique_id in subproject.resources | subproject.custom_macros | subproject.groups:
for unique_id in (
subproject.resources
| subproject.custom_macros
| subproject.groups
| subproject.referenced_docs
):
resource = subproject.get_manifest_node(unique_id)
if not resource:
raise KeyError(f"Resource {unique_id} not found in manifest")
Expand Down Expand Up @@ -182,10 +188,20 @@ def initialize(self) -> ChangeSet:
):
change_set.extend(reference_updater.update_parent_refs(resource))

elif resource.resource_type in ["macro", "group"]:
elif resource.resource_type in ["macro", "group", "doc"]:
if hasattr(resource, "patch_path") and resource.patch_path:
change_set.add(self.copy_resource_yml(resource))
change_set.add(self.copy_resource(resource))

if resource.unique_id in self.subproject.parent_project.jinja_blocks:
change_set.add(
self.copy_jinja_block(
resource,
self.subproject.parent_project.jinja_blocks[resource.unique_id],
)
)

else:
change_set.add(self.copy_resource(resource))

else:
logger.debug(
Expand Down Expand Up @@ -237,6 +253,17 @@ def move_resource(self, resource: Resource) -> FileChange:
source=self.subproject.parent_project.resolve_file_path(resource),
)

def copy_jinja_block(self, resource: Resource, jinja_block: JinjaBlock) -> FileChange:
    """Build the change that appends a Jinja block's content to the resource's file path in the subproject."""

    destination = self.subproject.resolve_file_path(resource)

    return FileChange(
        operation=Operation.Append,
        entity_type=EntityType.Code,
        identifier=resource.name,
        path=destination,
        data=jinja_block.content,
    )

def copy_resource(self, resource: Resource) -> FileChange:
"""
Copy a resource file from one project to another
Expand Down
12 changes: 12 additions & 0 deletions dbt_meshify/storage/file_content_editors.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,18 @@ def add(change: FileChange):

RawFileManager.write_file(path=change.path, content=change.data)

@staticmethod
def append(change: FileChange):
    """Append ``change.data`` to the file at ``change.path``.

    The parent directory is created first when it does not exist. When
    ``change.data`` is ``None`` the path is handed to
    ``RawFileManager.touch_file`` instead and that call's result is returned.
    """

    parent_dir = change.path.parent
    if not parent_dir.exists():
        parent_dir.mkdir(parents=True, exist_ok=True)

    payload = change.data
    if payload is not None:
        RawFileManager.append_file(path=change.path, content=payload)
        return None

    return RawFileManager.touch_file(change.path)

@staticmethod
def update(change: FileChange):
"""Update data to a new file."""
Expand Down
6 changes: 6 additions & 0 deletions dbt_meshify/storage/file_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,12 @@ def write_file(path: Path, content: str) -> None:
"""Write a string value to a file in the filesystem"""
path.write_text(content)

@staticmethod
def append_file(path: Path, content: str) -> None:
    """Write ``content`` at the end of the file at ``path`` by opening it in append mode."""
    with open(path, mode="a") as handle:
        handle.write(content)

@staticmethod
def copy_file(source_path: Path, target_path: Path) -> None:
if not target_path.parent.exists():
Expand Down
75 changes: 75 additions & 0 deletions dbt_meshify/storage/jinja_blocks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Set, Tuple


@dataclass
class JinjaBlock:
    """
    A data structure for tracking Jinja blocks of text. Includes the start and end
    character positions, and the content of the block.
    """

    path: Path  # File the block was read from.
    block_type: str  # Jinja tag name used to open the block, e.g. "docs" or "macro".
    name: str  # Name given to the block in its opening tag.
    start: int  # Character offset of the first character of the opening tag.
    end: int  # Character offset just past the last character of the closing tag.
    content: str  # file_content[start:end], i.e. the block including both tags.

    @staticmethod
    def find_block_range(file_content: str, block_type: str, name: str) -> Tuple[int, int]:
        """Find the character offsets at which a named Jinja block starts and ends.

        Args:
            file_content: Full text to search.
            block_type: Tag name that opens the block (its closing tag is ``end<block_type>``).
            name: Exact name of the block to locate.

        Returns:
            ``(start, end)`` where ``start`` is the offset of the opening tag and ``end``
            is the offset just past the first closing tag that follows the opening tag.

        Raises:
            Exception: If no opening tag for ``name`` exists, or no closing tag follows it.
        """
        # The negative lookahead stops a block named e.g. "customer_id" from satisfying
        # a search for "customer"; without it the optional argument group swallows the
        # remainder of the longer name.
        opening = re.search(
            r"{%-?\s+"
            + re.escape(block_type)
            + r"\s+"
            + re.escape(name)
            + r"(?![a-zA-Z0-9_])([(a-zA-Z0-9=,_ )]*)\s-?%}",
            file_content,
        )

        if opening is None:
            raise Exception(f"Unable to find a {block_type} block with the name {name}.")

        # Only look past the opening tag, so a previous block's closing tag that sits
        # immediately before this block can never be picked up as its end.
        closing = re.compile(r"{%-?\s+end" + re.escape(block_type) + r"\s+-?%}").search(
            file_content, opening.end()
        )

        if closing is None:
            raise Exception(f"Unable to find a the closing end{block_type} block for {name}.")

        return opening.start(), closing.end()

    @staticmethod
    def isolate_content(file_content: str, start: int, end: int) -> str:
        """Given content, a start position, and an end position, return the content of a Jinja block."""
        return file_content[start:end]

    @classmethod
    def from_file(cls, path: Path, block_type: str, name: str) -> "JinjaBlock":
        """Find a specific Jinja block within a file, based on the block type and the name.

        Raises:
            Exception: Propagated from ``find_block_range`` when the block cannot be located.
        """

        # Use a context manager so the file handle is closed deterministically.
        with open(path) as file:
            file_content = file.read()

        start, end = cls.find_block_range(file_content, block_type, name)
        content = cls.isolate_content(file_content=file_content, start=start, end=end)

        return cls(
            path=path, block_type=block_type, name=name, start=start, end=end, content=content
        )


def find_doc_reference(content: str) -> Set[str]:
    """Find all doc block references within a string.

    A reference is a Jinja expression of the form ``{{ doc('name') }}``. The name may
    be single-quoted, double-quoted or bare. Any amount of whitespace (including none)
    is accepted around ``doc(...)`` and inside the parentheses, as are the ``{{-`` /
    ``-}}`` whitespace-control markers.

    Args:
        content: Text to scan, e.g. model SQL or the text of a YAML properties file.

    Returns:
        The set of referenced doc names; empty when there are no references.
    """
    # \s* (rather than a single mandatory \s) so that tightly written or padded
    # expressions such as {{doc('x')}} are not silently skipped.
    pattern = r"\{\{-?\s*doc\(\s*\'?\"?([a-zA-Z0-9_\-\.]+)\'?\"?\s*\)\s*-?\}\}"

    return set(re.findall(pattern, content))
4 changes: 3 additions & 1 deletion test-projects/split/split_proj/macros/_macros.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
macros:
- name: cents_to_dollars
description: Converts cents to dollars
description: Converts cents to dollars
- name: dollars_to_cents
description: Converts dollars to cents
5 changes: 5 additions & 0 deletions test-projects/split/split_proj/macros/cents_to_dollars.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,8 @@
{% macro cents_to_dollars(column_name, precision=2) -%}
({{ column_name }} / 100)::{{ type_numeric() }}(16, {{ precision }})
{%- endmacro %}


{% macro dollars_to_cents(column_name) -%}
({{ column_name }} * 100)::{{ type_numeric() }}(16, 0)
{%- endmacro %}
3 changes: 3 additions & 0 deletions test-projects/split/split_proj/models/docs.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{% docs customer_id %}
The unique key for each customer.
{% enddocs %}
26 changes: 16 additions & 10 deletions test-projects/split/split_proj/models/marts/__models.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@ version: 2

models:
- name: customers
description: Customer overview data mart, offering key details for each unique
description:
Customer overview data mart, offering key details for each unique
customer. One row per customer.
columns:
- name: customer_id
description: The unique key of the orders mart.
description: "{{ doc('customer_id') }}"
tests:
- not_null
- unique
Expand All @@ -19,20 +20,24 @@ models:
- name: last_ordered_at
description: The timestamp of a customer's most recent order.
- name: lifetime_spend_pretax
description: The sum of all the pre-tax subtotals of every order a customer
description:
The sum of all the pre-tax subtotals of every order a customer
has placed.
- name: lifetime_spend
description: The sum of all the order totals (including tax) that a customer
description:
The sum of all the order totals (including tax) that a customer
has ever placed.
- name: customer_type
description: Options are 'new' or 'returning', indicating if a customer has
description:
Options are 'new' or 'returning', indicating if a customer has
ordered more than once or has only placed their first order to date.
tests:
- accepted_values:
values: [new, returning]

- name: orders
description: Order overview data mart, offering key details for each order inlcluding
description:
Order overview data mart, offering key details for each order inlcluding
if it's a customer's first order and a food vs. drink item breakdown. One row
per order.
tests:
Expand All @@ -53,7 +58,8 @@ models:
to: ref('stg_customers')
field: customer_id
- name: location_id
description: The foreign key relating to the location the order was placed
description:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a note for future us -- we gotta see if ruamel can stop editing newlines 😅

The foreign key relating to the location the order was placed
at.
- name: order_total
description: The total amount of the order in USD including tax.
Expand All @@ -74,19 +80,19 @@ models:
- name: order_cost
description: The sum of supply expenses to fulfill the order.
- name: location_name
description: The full location name of where this order was placed. Denormalized
description:
The full location name of where this order was placed. Denormalized
from `stg_locations`.
- name: is_food_order
description: A boolean indicating if this order included any food items.
- name: is_drink_order
description: A boolean indicating if this order included any drink items.


- name: leaf_node
description: A leaf node model that is not referenced by any other model.
columns:
- name: order_id
description: The unique key of the leaf node.
tests:
- not_null
- unique
- unique
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ models:
description: Customer data with basic cleaning and transformation applied, one row per customer.
columns:
- name: customer_id
description: The unique key for each customer.
description: "{{ doc('customer_id') }}"
tests:
- not_null
- unique
Expand Down
Loading