Azure SQL Hyperscale – Best approach to merge large main table with extension tables

Janice Chi 620 Reputation points
2025-08-19T17:02:13.39+00:00

We are working with Azure SQL Hyperscale. The environment has one large table of around 19 TB, along with two smaller extension tables of around 1.5 TB each. These extension tables (_EXT) share the same primary keys as the main table but contain different columns.

The requirement is to merge these extension tables into the main table inside Hyperscale. The goal is to complete this with minimal compute usage and with reduced chances of failure, given the scale of data involved.

What is the recommended way, best practices to perform such a merge in Azure SQL Hyperscale to balance efficiency and reliability at this data volume?

please note this:These extension tables (_EXT) share the same primary keys as the main table but contain different columns.

Azure SQL Database
{count} votes

1 answer

Sort by: Most helpful
  1. Oury Ba-MSFT 21,111 Reputation points Microsoft Employee Moderator
    2025-08-19T19:54:55.89+00:00

    @Janice Chi Thank you for reaching out.

    You can add new columns to main table and use UPDATE to update the values from _EXT tables to main table.

     

    Please see an example below.

     

     -- Base table

    CREATE TABLE dbo.SalesMain

    (

        SalesID       BIGINT        NOT NULL,

        OrderDate     DATE          NOT NULL,

        CustomerID    INT           NOT NULL,

        Amount        DECIMAL(18,2) NOT NULL,

        -- New columns to be merged in (start as absent; we'll add using ALTER later)

        -- Ext1_Notes   NVARCHAR(200)  NULL,

        -- Ext2_TaxCode VARCHAR(20)    NULL,

        CONSTRAINT PK_SalesMain PRIMARY KEY CLUSTERED (SalesID)

    );

    GO

    -- Extension 1: shares PK, adds new columns

    CREATE TABLE dbo.SalesMain_EXT1

    (

        SalesID    BIGINT        NOT NULL PRIMARY KEY,

        Ext1_Notes NVARCHAR(200) NULL

    );

    GO

    -- Extension 2: shares PK, adds different columns

    CREATE TABLE dbo.SalesMain_EXT2

    (

        SalesID     BIGINT       NOT NULL PRIMARY KEY,

        Ext2_TaxCode VARCHAR(20) NULL

    );

    GO

    -- Sample data (tiny)

    INSERT dbo.SalesMain (SalesID, OrderDate, CustomerID, Amount)

    VALUES (1,'2025-01-01',101,100.00),(2,'2025-01-02',102,200.00),(3,'2025-01-03',103,300.00);

    INSERT dbo.SalesMain_EXT1 (SalesID, Ext1_Notes)

    VALUES (1,N'First order'),(3,N'VIP customer');

    INSERT dbo.SalesMain_EXT2 (SalesID, Ext2_TaxCode)

    VALUES (1,'GST5'),(2,'GST12');

     

     

    ALTER TABLE dbo.SalesMain ADD Ext1_Notes NVARCHAR(200) NULL, Ext2_TaxCode VARCHAR(20) NULL;

     

    -- table to track progress

    CREATE TABLE dbo.MergeProgress

    (

        Phase        VARCHAR(30) NOT NULL,  -- 'EXT1' or 'EXT2'

        LastID       BIGINT      NOT NULL,

        StartedAt    DATETIME2   NOT NULL DEFAULT SYSUTCDATETIME(),

        RowsDone     BIGINT      NOT NULL DEFAULT 0,

        CONSTRAINT PK_MergeProgress PRIMARY KEY (Phase)

    );

     

    INSERT INTO dbo.MergeProgress(Phase, LastID, RowsDone) VALUES ('EXT1', 0, 0), ('EXT2', 0, 0);

     

     

    SET NOCOUNT ON;

    DECLARE @BatchSize INT = 50000;       -- tune to your log budget

    DECLARE @Rows INT, @Last BIGINT;

     

    WHILE 1=1

    BEGIN

        SELECT @Last = LastID FROM dbo.MergeProgress WHERE Phase='EXT1';

     

        ;WITH next_ids AS

        (

            SELECT TOP (@BatchSize) e.SalesID

            FROM dbo.SalesMain_EXT1 AS e

            WHERE e.SalesID > @Last

            ORDER BY e.SalesID

        )

        UPDATE M

            SET M.Ext1_Notes = E.Ext1_Notes

        FROM dbo.SalesMain AS M

        JOIN next_ids AS N ON N.SalesID = M.SalesID

        JOIN dbo.SalesMain_EXT1 AS E ON E.SalesID = N.SalesID;

     

        SET @Rows = @@ROWCOUNT;

        IF @Rows = 0 BREAK;

     

        -- Advance checkpoint

        UPDATE dbo.MergeProgress

           SET LastID = (SELECT MAX(SalesID) FROM dbo.SalesMain_EXT1 WHERE SalesID > @Last),

               RowsDone = RowsDone + @Rows

         WHERE Phase = 'EXT1';

     

        -- Gentle throttle to stay under log rate governance if needed

        WAITFOR DELAY '00:00:01';

    END

     

    SET NOCOUNT ON;

    DECLARE @BatchSize2 INT = 50000, @Rows2 INT, @Last2 BIGINT;

     

    WHILE 1=1

    BEGIN

        SELECT @Last2 = LastID FROM dbo.MergeProgress WHERE Phase='EXT2';

     

        ;WITH next_ids AS

        (

            SELECT TOP (@BatchSize2) e.SalesID

            FROM dbo.SalesMain_EXT2 AS e

            WHERE e.SalesID > @Last2

            ORDER BY e.SalesID

        )

        UPDATE M

            SET M.Ext2_TaxCode = E.Ext2_TaxCode

        FROM dbo.SalesMain AS M

        JOIN next_ids AS N ON N.SalesID = M.SalesID

        JOIN dbo.SalesMain_EXT2 AS E ON E.SalesID = N.SalesID;

     

        SET @Rows2 = @@ROWCOUNT;

        IF @Rows2 = 0 BREAK;

     

        UPDATE dbo.MergeProgress

           SET LastID = (SELECT MAX(SalesID) FROM dbo.SalesMain_EXT2 WHERE SalesID > @Last2),

               RowsDone = RowsDone + @Rows2

         WHERE Phase = 'EXT2';

     

        WAITFOR DELAY '00:00:01';

    END

     

    -- validate

    SELECT 'EXT1' AS Src, COUNT(*) FROM dbo.SalesMain_EXT1

    UNION ALL

    SELECT 'EXT1 Merged', COUNT(*) FROM dbo.SalesMain WHERE Ext1_Notes IS NOT NULL;

     

    SELECT 'EXT2' AS Src, COUNT(*) FROM dbo.SalesMain_EXT2

    UNION ALL

    SELECT 'EXT2 Merged', COUNT(*) FROM dbo.SalesMain WHERE Ext2_TaxCode IS NOT NULL;

    Please do let us know if this is helpful.

    1 person found this answer helpful.

Your answer

Answers can be marked as Accepted Answers by the question author, which helps users to know the answer solved the author's problem.