Copying Images from document #1457

rbp9802 · 2024-12-21T00:24:45Z

Hello everyone!
I'm trying to copy the content of one document into another document, but I can't get the image relationships to copy correctly.

So:

copy_body(source_doc, target_doc) copies the body from source_doc to target_doc. Relationships are not adjusted.
copy_relations(source_doc, target_doc) adds the image relationships present in source_doc to target_doc and saves the new rIds in a dict (old rId --> new rId).
After this, I look for the copied images in the target document with for pic in element.xpath('.//a:blip') and change the rId stored in pic.attrib[qn('r:embed')] to the new rId.
The pictures cannot be displayed when the saved document is opened.

import os
from io import BytesIO
from docx import Document
from copy import deepcopy
from docx.oxml.ns import qn

def copy_body(source_doc, target_doc):
    """Append a deep copy of the body content of ``source_doc`` to ``target_doc``.

    The copied elements are inserted at the end of the target body but
    *before* the target's body-level ``w:sectPr`` element, so the section
    properties remain the last child of ``w:body``.  The source document's own
    body-level ``w:sectPr`` is not copied: the target keeps its own section
    settings, and the source's header/footer references would point at
    relationships that do not exist in the target.

    Relationship ids (for example ``r:embed`` on pictures) inside the copied
    elements are NOT adjusted here.

    Args:
        source_doc: document whose body children are copied.
        target_doc: document that receives the copies (modified in place).
    """
    sect_pr_tag = qn('w:sectPr')
    target_body = target_doc.element.body
    # Direct-child lookup; None when the target body has no section properties.
    target_sect_pr = target_body.find(sect_pr_tag)

    for element in source_doc.element.body.iterchildren():
        if element.tag == sect_pr_tag:
            continue
        copied_element = deepcopy(element)
        if target_sect_pr is None:
            target_body.append(copied_element)
        else:
            # Inserting each copy just before sectPr keeps the source order.
            target_sect_pr.addprevious(copied_element)
        
def copy_relations(source_doc, target_doc):
    """Copy the embedded images of ``source_doc`` into ``target_doc``.

    For every internal image relationship of the source document part, the
    image bytes are added to the target package and a relationship to the new
    image part is created on the target document part.

    Args:
        source_doc: document whose image relationships are read.
        target_doc: document that receives the image parts and relationships.

    Returns:
        dict mapping each source image rId to the rId that refers to the same
        image in ``target_doc``.
    """
    # Full relationship-type URI for images (the value python-docx exposes as
    # docx.opc.constants.RELATIONSHIP_TYPE.IMAGE).  The relationship must be
    # created with this URI; the bare string 'image' is not a valid
    # relationship type, so pictures related that way cannot be displayed.
    image_reltype = (
        'http://schemas.openxmlformats.org/officeDocument/2006/'
        'relationships/image'
    )

    new_rels = {}
    source_rels = source_doc.part.rels
    target_part = target_doc.part

    for rId in sorted(source_rels):
        rel = source_rels[rId]
        # Linked (external) images have no target part to copy from.
        if rel.reltype != image_reltype or rel.is_external:
            continue

        # Get image binary data
        img_data = rel.target_part.blob

        # Create (or reuse) the image part in the target package
        image_part = target_part.package.get_or_add_image_part(BytesIO(img_data))

        # Create new relationship
        new_rId = target_part.relate_to(image_part, image_reltype)
        new_rel = target_part.rels[new_rId]
        print(
            f'Image copied: {rId} {rel.target_ref} {len(img_data)/1024:.3f}kB '
            f'--> {new_rId} {new_rel.target_ref} {len(image_part.blob)/1024:.3f}kB'
        )
        new_rels[rId] = new_rId
    return new_rels

# Example driver: copy the body of one document into another and re-point
# the copied pictures at the image relationships created in the target.
folder = os.path.dirname(__file__)
source_file = 'doc_2_copy.docx'
target_file = 'src_doc.docx'
final_file = 'new_doc.docx'

source_doc = Document(os.path.join(folder, source_file))
target_doc = Document(os.path.join(folder, target_file))

# Remember which body elements belonged to the target before copying, so that
# only the copied elements get their rIds rewritten.  Pictures that were
# already in the target use the target's own rIds and must be left alone.
original_elements = set(target_doc.element.body.iterchildren())

copy_body(source_doc, target_doc)  # Copy body
new_rels = copy_relations(source_doc, target_doc)  # Copy relations and save rId's in new_rels

# Modify relations rId
embed_attr = qn('r:embed')
for element in target_doc.element.body.iterchildren():
    if element in original_elements:
        continue
    # Look for images in the copied body elements
    for pic in element.xpath('.//a:blip'):
        rId = pic.get(embed_attr)  # Getting old rId
        if rId is None:
            continue
        print(f"Image rID : {rId}")
        new_rId = new_rels.get(rId)
        if new_rId is None:
            # The source rId was not an image relationship that got copied.
            print(f'  No copied image for {rId}; left unchanged')
            continue
        if rId != new_rId:
            print(f'  Modifying rId: {rId} -> {new_rId}')
            pic.set(embed_attr, new_rId)  # Setting new rId

target_doc.save(final_file)

Running the script prints in terminal:

Image copied: rId5 media/image1.png 28.188kB --> rId6 media/image1.png 28.188kB
Image copied: rId6 media/image2.png 24.028kB --> rId7 media/image2.png 24.028kB
Image rID: rId5
  Modifying rId: rId5 -> rId6
Image rID: rId6
  Modifying rId: rId6 -> rId7

I don't know what I'm missing, help please!

The text was updated successfully, but these errors were encountered:

rbp9802 · 2024-12-23T19:19:36Z

UPDATE:

I managed to track down the picture locations in the document, delete the pictures, and re-add them as new pictures. This is a much longer way than expected, but it works.

Anyway I appreciate any comments on the original issue.

Thanks!

The process was:

Extract the original pictures from the source document and save them to disk
Copy source document body via deepcopy
Find the copied pictures in the target document, delete them, and re-add them as new pictures.

import os
from docx import Document
from copy import deepcopy

from docx.text.paragraph import Paragraph
from docx.text.run import Run

from docx.oxml.ns import qn
from lxml import etree

def copy_body(source_doc, target_doc, next_to_element=None):
    """Insert a deep copy of the body content of ``source_doc`` into ``target_doc``.

    The source document's body-level ``w:sectPr`` is not copied, so the target
    keeps a single set of body section properties.

    Args:
        source_doc: document whose body children are copied.
        target_doc: document that receives the copies (modified in place).
        next_to_element: element of the target after which the copies are
            inserted, in source order.  When ``None`` the copies are placed at
            the end of the target body, before its ``w:sectPr`` if it has one.
    """
    sect_pr_tag = qn('w:sectPr')
    copies = [
        deepcopy(element)
        for element in source_doc.element.body.iterchildren()
        if element.tag != sect_pr_tag
    ]

    if next_to_element is None:
        target_body = target_doc.element.body
        target_sect_pr = target_body.find(sect_pr_tag)
        for copied_element in copies:
            if target_sect_pr is None:
                target_body.append(copied_element)
            else:
                # Keeps sectPr as the last child of the body.
                target_sect_pr.addprevious(copied_element)
        return

    # addnext() puts each element directly after the anchor, so walking the
    # copies backwards leaves them in their original order.
    for copied_element in reversed(copies):
        next_to_element.addnext(copied_element)

def check_images(doc):
    """Replace inline pictures in ``doc`` with freshly added copies.

    For each ``a:blip`` with an ``r:embed`` rId found in the document body, the
    image file named after that rId is looked up in the module-level
    ``output_folder``.  When a file is found, the drawing holding the blip is
    removed and the image is added again to the same run with the original
    width and height, so that ``doc`` gets its own image part and
    relationship.

    Pictures are left untouched when no extracted file matches their rId, or
    when they are not inline (``run.add_picture`` only creates inline
    pictures).

    NOTE: rIds are matched by name only.  A picture that already existed in
    ``doc`` under an rId that is also used by the source document is replaced
    as well; telling the two apart is not handled here.

    Args:
        doc: the document to fix up (modified in place).

    Returns:
        list of the inline shapes that were added.
    """
    import glob  # local import: only needed to find the extracted file

    embed_attr = qn('r:embed')
    drawing_tag = qn('w:drawing')

    added_images = []
    for element in doc.element.body.iterchildren():
        for pic in element.xpath('.//a:blip'):
            # Get relation Id
            rId = pic.get(embed_attr)
            if rId is None:
                continue

            # The extracted file keeps the image's real extension, so match
            # on the rId and accept any extension.
            pattern = os.path.join(
                glob.escape(output_folder), glob.escape(rId) + '.*'
            )
            matches = sorted(glob.glob(pattern))
            if not matches:
                continue

            # Get run that contains picture
            r = get_parent_run(pic)
            p = r.getparent()
            paragraph = Paragraph(p, doc)
            run = Run(r, paragraph)

            # Find the drawing that actually holds this blip
            drawing = pic.getparent()
            while drawing is not None and drawing.tag != drawing_tag:
                drawing = drawing.getparent()
            if drawing is None:
                continue

            # Get picture size (inline pictures only)
            inline = drawing.find(qn('wp:inline'))
            extent = inline.find(qn('wp:extent')) if inline is not None else None
            if extent is None:
                continue
            cx = extent.get('cx')
            cy = extent.get('cy')

            # Remove old drawing
            drawing.getparent().remove(drawing)

            # Add picture to the run
            inline_shape = run.add_picture(matches[0])
            inline_shape.width = int(cx)
            inline_shape.height = int(cy)

            added_images.append(inline_shape)
    return added_images
                
def get_parent_run(element):
    """Return ``element`` or its nearest ancestor whose tag is ``w:r``."""
    run_tag = qn('w:r')
    node = element
    while True:
        if node.tag == run_tag:
            return node
        # Step one level up the tree and test again.
        node = node.getparent()

def extract_images(doc, output_folder):
    """Write every embedded image of ``doc`` to ``output_folder``.

    Each image is saved as ``<rId><extension>``, where the extension is taken
    from the relationship's target reference (``.png`` when it has none).
    Linked (external) images are skipped because they have no part in the
    package to read from.

    Args:
        doc: document whose image relationships are read.
        output_folder: directory to write to; created if it does not exist.
    """
    # Full relationship-type URI for images (the value python-docx exposes as
    # docx.opc.constants.RELATIONSHIP_TYPE.IMAGE).  Matching on the type is
    # more reliable than looking for "image" in the target file name.
    image_reltype = (
        "http://schemas.openxmlformats.org/officeDocument/2006/"
        "relationships/image"
    )

    os.makedirs(output_folder, exist_ok=True)
    for rId, rel in doc.part.rels.items():
        if rel.reltype != image_reltype or rel.is_external:
            continue
        extension = os.path.splitext(rel.target_ref)[1] or ".png"
        output_path = os.path.join(output_folder, f"{rId}{extension}")
        with open(output_path, "wb") as f:
            f.write(rel.target_part.blob)
        print(f"Imagen extraída: {output_path}")

# Example: copy the body of one document into another, then rebuild the
# copied pictures from image files extracted from the source document.
folder = os.path.dirname(__file__)
source_file = 'document_to_copy.docx'
target_file = 'document_base.docx'
output_file = 'document_copy.docx'

# Also read by check_images() as a module-level name.
output_folder = 'extracted_images'

# Open documents
source_doc = Document(os.path.join(folder, source_file))
target_doc = Document(os.path.join(folder, target_file))

# Extract source_doc images
extract_images(source_doc, output_folder)

# Copy body after the first paragraph of the target; when the target has no
# paragraphs, pass None so copy_body() falls back to its default position.
target_paragraphs = target_doc.paragraphs
after_paragraph = target_paragraphs[0] if target_paragraphs else None
copy_body(
    source_doc,
    target_doc,
    next_to_element=None if after_paragraph is None else after_paragraph._element,
)

# Check images
added_images = check_images(target_doc)

target_doc.save(output_file)
print(f"Document saved as: {output_file}")

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Copying Images from document #1457

Copying Images from document #1457

rbp9802 commented Dec 21, 2024

rbp9802 commented Dec 23, 2024 •

edited

Loading

Copying Images from document #1457

Copying Images from document #1457

Comments

rbp9802 commented Dec 21, 2024

rbp9802 commented Dec 23, 2024 • edited Loading

rbp9802 commented Dec 23, 2024 •

edited

Loading