The Databricks notebook below, Workbooks.py, uses a metadata dictionary (control_metadata) to dynamically create multiple worksheets across two Excel workbooks,
workbook_staff and workbook_member. Each workbook is saved to a configured .xlsx
file path and then uploaded to both Azure Blob Storage and SharePoint Online. All worksheets
are password-protected, and the data rows are programmatically hidden to prevent casual viewing or modification.
Issue:
When the spreadsheets are reopened via SharePoint, the previously hidden rows are visible again
and the worksheet protection appears to be gone, allowing users to view and potentially edit the contents
(a verification sketch for checking the round-tripped files is at the end of this post).
Workbooks.py:
import os
import json
import logging
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import col  # col lives in pyspark.sql.functions, not pyspark.sql
from datetime import datetime
import xlsxwriter
from azure.storage.blob import BlobServiceClient
from office365.runtime.auth.client_credential import ClientCredential
from office365.sharepoint.client_context import ClientContext
# Setup logging
log = logging.getLogger("Pipeline-bcp_plus_critical_member_data")
log.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] - %(message)s"))
log.addHandler(handler)
log.info("Starting Pipeline bcp_plus_critical_member_data")
# Load JSON config
config_path = "Workspace/Shared/bcp/config.json"
with open(config_path, encoding="utf-8") as f:
config = json.load(f)
# Read config values
file_path = config["file_path"]
staff_file_name = config["staff_file_name"]
member_file_name = config["member_file_name"]
output_container_name = config["output_container_name"]
output_blob_staff = config["output_blob_staff"]
output_blob_member = config["output_blob_member"]
output_storage_connection_string = config["output_storage_connection_string"]
sp_file_path_staff = config["SharePointPathStaff"]
sp_file_path_member = config["SharePointPathMember"]
url = config["url"]
client_id = config["client_id"]
# Spark + Secrets
spark = SparkSession.builder.getOrCreate()
# Workbook/worksheet protection password from the secret scope
workbook_protection_password = dbutils.secrets.get("mdp-secrets", key="workbook-password")
# Retrieve managed identity credential
uc_credential = "bac-mdp-dev-mel"
managedidentity_credential = dbutils.credentials.getServiceCredentialsProvider(uc_credential)
# Blob service client
# NOTE: account_url must be the account endpoint (https://<account>.blob.core.windows.net);
# for an actual connection string, use BlobServiceClient.from_connection_string() instead.
output_blob_service_client = BlobServiceClient(
    account_url=output_storage_connection_string,
    credential=managedidentity_credential
)
xlsx_path_staff = os.path.join(file_path, staff_file_name)
xlsx_path_member = os.path.join(file_path, member_file_name)
# -------------------------------------------------------------------
## Attempted file-open encryption + workbook structure protection
# -------------------------------------------------------------------
workbook_staff = xlsxwriter.Workbook(
    xlsx_path_staff,
    {
        'constant_memory': True,      # stream rows to disk to keep memory flat
        'strings_to_numbers': True,
        # NOTE: 'password' and 'workbook_properties' are not documented XlsxWriter
        # constructor options; XlsxWriter does not support file-open encryption,
        # so these two settings are most likely ignored when the file is written.
        'password': workbook_protection_password,
        'workbook_properties': {
            'protection': {
                'password': workbook_protection_password,
                'lock_structure': True
            }
        }
    }
)
workbook_member = xlsxwriter.Workbook(
    xlsx_path_member,
    {
        'constant_memory': True,
        'strings_to_numbers': True,
        'password': workbook_protection_password,   # same caveat as workbook_staff above
        'workbook_properties': {
            'protection': {
                'password': workbook_protection_password,
                'lock_structure': True
            }
        }
    }
)
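# For contrast, an illustrative sketch (my assumption, not from the original notebook)
# of the protection XlsxWriter does document: per-worksheet protect() options, where
# True allows an action on the protected sheet. The defaults already disallow row
# formatting, which is what normally keeps hidden rows hidden.
EXAMPLE_PROTECT_OPTIONS = {
    'format_rows': False,         # users cannot resize or unhide rows
    'format_columns': False,      # users cannot resize or unhide columns
    'select_locked_cells': True,  # users may still select (but not edit) cells
}
# Usage would be: sheet.protect(workbook_protection_password, EXAMPLE_PROTECT_OPTIONS)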
# Cell sanitizer: coerce NumPy/pandas/container values into Excel-writable types
def sanitize(val):
    if isinstance(val, np.ndarray):
        return json.dumps(val.tolist())
    if isinstance(val, np.generic):
        return val.item()
    if isinstance(val, pd.Timestamp):
        return val.to_pydatetime()
    # Containers must be checked before pd.isna(): isna() on a list returns an
    # elementwise array, and truth-testing that array raises ValueError
    if isinstance(val, (dict, list)):
        return json.dumps(val)
    if pd.isna(val):
        return None
    return val
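# Quick illustrative checks of sanitize() over the cases handled above
assert sanitize(np.int64(7)) == 7
assert sanitize(pd.Timestamp("2024-01-01")) == datetime(2024, 1, 1)
assert sanitize({"k": 1}) == '{"k": 1}'
assert sanitize(float("nan")) is None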
# control metadata
control_metadata = [
    {"worksheet_name": "MmeberAccount1", "rows_per_worksheet": 1000000,
     "extract_script": "select * from dev.TableMmeberAccount", "file_type": "Staff"},
    {"worksheet_name": "MmeberAccount2", "rows_per_worksheet": 1000000,
     "extract_script": "select * from dev.TableMmeberAccount", "file_type": "Staff"},
    {"worksheet_name": "MmeberAccount3", "rows_per_worksheet": 1000000,
     "extract_script": "select * from dev.TableMmeberAccount", "file_type": "member"},
    {"worksheet_name": "MmeberAccount4", "rows_per_worksheet": 1000000,
     "extract_script": "select * from dev.TableMmeberAccount", "file_type": "member"}
]
for row in control_metadata:
    ws_name = (row["worksheet_name"] or "Sheet")[:25]  # leave room for the _<part> suffix
    script = row["extract_script"]
    max_rows = int(row["rows_per_worksheet"])
    file_type = row["file_type"].strip().lower()
    # Pick the right workbook for this extract
    workbook = workbook_staff if file_type == "staff" else workbook_member
    try:
        log.info(f"Processing {file_type} worksheet: {ws_name}")
        df = spark.sql(script)
        # Cast timestamp columns to string so xlsxwriter receives plain values
        for field in df.schema.fields:
            if field.dataType.simpleString().startswith("timestamp"):
                df = df.withColumn(field.name, col(field.name).cast("string"))
        pdf = df.toPandas()
        total_rows = len(pdf)
        parts = (total_rows // max_rows) + (1 if total_rows % max_rows else 0)
        log.info(f"Pandas shape: {pdf.shape}, splitting into {parts} part(s)")
        for part in range(parts):
            start, end = part * max_rows, min((part + 1) * max_rows, total_rows)
            sheet_name = f"{ws_name}_{part+1}"[:31]
            sheet = workbook.add_worksheet(sheet_name)
            # Write headers
            for ci, cname in enumerate(pdf.columns):
                sheet.write(0, ci, cname)
            # Write data rows, hiding each row as it is completed
            for ri, row_vals in enumerate(pdf.iloc[start:end].values, start=1):
                for ci, cell in enumerate(row_vals):
                    sheet.write(ri, ci, sanitize(cell))
                sheet.set_row(ri, None, None, {'hidden': True})
            ## Protect each sheet so rows stay hidden and cells stay locked
            sheet.protect(password=workbook_protection_password)
    except Exception as e:
        log.error(f"Error in worksheet {ws_name}: {e}", exc_info=True)
        continue
# Close workbooks to apply all encryption & protection
workbook_staff.close()
workbook_member.close()
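# --- Illustrative workaround sketch (an assumption, not part of the original run) ---
# Since XlsxWriter has no file-open encryption, one option is a post-processing pass
# with openpyxl that re-applies workbook and sheet protection to the saved files
# before upload. Defined for reference only; deliberately not called here.
def reapply_protection(path, password):
    from openpyxl import load_workbook
    from openpyxl.workbook.protection import WorkbookProtection
    wb = load_workbook(path)
    # Lock the workbook structure (adding/removing/reordering sheets)
    wb.security = WorkbookProtection(workbookPassword=password, lockStructure=True)
    for ws in wb.worksheets:
        ws.protection.sheet = True         # enable sheet protection
        ws.protection.password = password  # the setter stores a hashed password
    wb.save(path)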
# Upload to ADLS
for fname, xpath, blob_name in [
    (staff_file_name, xlsx_path_staff, output_blob_staff + staff_file_name),
    (member_file_name, xlsx_path_member, output_blob_member + member_file_name)
]:
    blob_client = output_blob_service_client.get_blob_client(
        container=output_container_name,
        blob=blob_name
    )
    with open(xpath, "rb") as f:
        blob_client.upload_blob(f, overwrite=True)
    log.info(f"Uploaded {fname} to {output_container_name}/{blob_name}")
# SharePoint Upload
client_secret = dbutils.secrets.get("mdp-secrets", key="sp-client-secret")
tenant_id = "yyyyy"  # redacted; not required by ClientCredential below
# office365's ClientContext expects its own ClientCredential, not azure.identity's
# ClientSecretCredential, which with_credentials() does not accept.
sp_cred = ClientCredential(client_id, client_secret)
ctx = ClientContext(url).with_credentials(sp_cred)
ctx.web.get().execute_query()
folder_staff = ctx.web.get_folder_by_server_relative_url(sp_file_path_staff)
folder_member = ctx.web.get_folder_by_server_relative_url(sp_file_path_member)
# Use the full blob names (prefix + file name) that the upload loop above wrote to
for fname, blob_name, folder in [
    (staff_file_name, output_blob_staff + staff_file_name, folder_staff),
    (member_file_name, output_blob_member + member_file_name, folder_member),
]:
    blob_client = output_blob_service_client.get_blob_client(
        container=output_container_name,
        blob=blob_name
    )
    data = blob_client.download_blob().readall()
    upload_file = folder.files.add(fname, data, overwrite=True).execute_query()
    log.info(f"Uploaded {fname} to SharePoint at: {upload_file.serverRelativeUrl}")