SQL
3 Raw JSON files
Data was imported using PostgreSQL. Cleaning was done with data as Text and JSON format with Nested Queries, Regex and Wildcards.
Tables With Established Relationships
The data was parsed into tables once all the information was extracted from the JSON properties.
-- Table: brands
------------------------------------------------
-- Create Table brands and import brands.json
CREATE TABLE brands (values json);
GRANT SELECT ON brands TO PUBLIC;
COPY brands from 'C:\Users\joszp\Documents\Fetch\brands.json';
-- Error found on line 750. Unexpected "" Inside "name" key and its value pair.
-- General Mills \"Big G\" Cereal"
DROP TABLE brands;
-- Create Table brands in order to store brands.json as text type
CREATE TABLE brands (values text);
GRANT SELECT ON brands TO PUBLIC;
COPY brands from 'C:\Users\joszp\Documents\Fetch\brands.json';
-- Verify table
SELECT * FROM brands;
-- Inspect row raising error
SELECT * FROM brands WHERE values LIKE '%Big G%';
-- Update brands table and replace improper "" using LIKE and Wildcards
UPDATE brands SET values = '{"_id":{"$oid":"5dc07e2aa60b873d6b0666d1"},"name":"General Mills Big G Cereal","cpg":{"$ref":"Cogs","$id":{"$oid":"53e10d6368abd3c7065097cc"}},"barcode":"511111006305","brandCode":"GENERAL MILLS CEREAL"}'
WHERE values LIKE '%Big G%';
-- Verify row raising error is corrected
SELECT * FROM brands WHERE values LIKE '%Big G%';
-- Convert brands type to JSON and manipulate as JSON
ALTER TABLE brands
ALTER COLUMN values type json USING "values"::json;
-- Verify table
SELECT * FROM brands;
-- Inspect JSON file and extract values to a new table
-- Extract all keys
SELECT DISTINCT(json_object_keys(values)) FROM brands; -- 8
-- Create table to store JSON parsed values
CREATE TABLE brands_parsed (
index SERIAL,
id VARCHAR,
barcode BIGINT,
category VARCHAR,
category_code VARCHAR,
cpg VARCHAR,
name VARCHAR,
brand_code VARCHAR,
top_brand BOOLEAN
);
-- Verify Table
SELECT * FROM brands_parsed;
DROP TABLE brands_parsed;
-- Identify the path for each feature
SELECT values FROM brands;
SELECT values->'_id' FROM brands;
SELECT values::json#>>'{_id,$oid}' FROM brands; --id
SELECT values->'barcode' FROM brands;
SELECT values->>'barcode' FROM brands;
SELECT values::json#>'{barcode}' FROM brands; -- barcode
SELECT values->'category' FROM brands; -- category
SELECT values->'categoryCode' FROM brands;
SELECT values::json#>'{cpg,$id,$oid}' FROM brands; -- categoryCode
SELECT values->'name' FROM brands; -- name
SELECT values->'brandCode' FROM brands; -- brandCode
SELECT values->'topBrand' FROM brands; -- topBrand
-- Extract data from brands.values json into brands_parsed
INSERT INTO brands_parsed(id, barcode, category, category_code, cpg, name, brand_code, top_brand)
SELECT values::json#>>'{_id,$oid}', CAST(values->>'barcode' AS BIGINT), values->>'category', values->>'categoryCode',
values::json#>>'{cpg,$id,$oid}', values->>'name', values->>'brandCode', CAST(values->>'topBrand' AS BOOLEAN)
FROM brands;
-- Clean text from name column
UPDATE brands_parsed SET name = TRIM(LEADING 'test brand @' FROM name)
WHERE name LIKE '%test brand @%';
-- Clean text from brand_code column
UPDATE brands_parsed SET brand_code = TRIM(LEADING 'TEST BRANDCODE @' FROM brand_code)
WHERE brand_code LIKE '%TEST BRANDCODE @%';
-- Verify Table
SELECT * FROM brands_parsed ORDER BY index;
------------------------------------------------
-- Table: receipts
------------------------------------------------
-- Create Table receipts and import receipts.json
CREATE TABLE receipts (values json);
GRANT SELECT ON receipts TO PUBLIC;
COPY receipts from 'C:\Users\joszp\Documents\Fetch\receipts.json';
-- Error found on line 431. Unexpected "" Inside "description" key and its value pair.
-- "description":"522 9" PLATE" (has extra " after the 9)
DROP TABLE receipts;
-- Create Table receipts in order to store receipts.json as text type
CREATE TABLE receipts (values text);
GRANT SELECT ON receipts TO PUBLIC;
COPY receipts from 'C:\Users\joszp\Documents\Fetch\receipts.json';
-- Verify table
SELECT * FROM receipts
FETCH FIRST 431 ROWS ONLY;
-- UPDATE receipts and replace values for error
UPDATE receipts SET values = REGEXP_REPLACE(values, ' 9" PLATE', ' 9 PLATE', 'g');
-- Convert brands type to JSON and manipulate as JSON
ALTER TABLE receipts
ALTER COLUMN values type json USING "values"::json;
-- Error found. Unexpected "" Inside "description" key and its value pair.
-- :"Black and White Easter Bunny 16 1/2" tall (has extra " after the 1/2)
-- UPDATE receipts and replace values for error
UPDATE receipts SET values = REGEXP_REPLACE(values, ' 1/2" tall', ' 1/2 tall', 'g');
-- Convert brands type to JSON and manipulate as JSON
ALTER TABLE receipts
ALTER COLUMN values type json USING "values"::json;
-- Verify table
SELECT * FROM receipts
-- Inspect JSON file and extract values to a new table
-- Extract all keys
SELECT DISTINCT(json_object_keys(values)) FROM receipts; -- 15 keys
-- "_id", "userId", "pointsEarned", "bonusPointsEarned", "purchaseDate" "modifyDate" "pointsAwardedDate" "dateScanned" "bonusPointsEarnedReason"
-- "rewardsReceiptStatus", "finishedDate" "rewardsReceiptItemList" "createDate" "totalSpent" "purchasedItemCount"
-- Identify the path for each feature
SELECT values FROM receipts;
SELECT values->'_id' FROM receipts;
SELECT values::json#>>'{_id,$oid}' FROM receipts; -- _id
SELECT values->>'userId' FROM receipts -- userId
SELECT values->>'pointsEarned' FROM receipts -- pointsEarned
SELECT values->>'bonusPointsEarned' FROM receipts -- bonusPointsEarned
SELECT values->>'purchaseDate' FROM receipts
SELECT values::json#>>'{purchaseDate,$date}' FROM receipts -- purchaseDate
SELECT values::json#>>'{modifyDate,$date}' FROM receipts -- modifyDate
SELECT values->>'pointsAwardedDate' FROM receipts
SELECT values::json#>>'{pointsAwardedDate,$date}' FROM receipts -- pointsAwardedDate
SELECT values->>'dateScanned' FROM receipts
SELECT values::json#>>'{dateScanned,$date}' FROM receipts -- dateScanned
SELECT values->>'bonusPointsEarnedReason' FROM receipts -- bonusPointsEarnedReason
SELECT values->>'rewardsReceiptStatus' FROM receipts -- rewardsReceiptStatus
SELECT values->>'finishedDate' FROM receipts
SELECT values::json#>>'{finishedDate,$date}' FROM receipts -- finishedDate
SELECT values->>'rewardsReceiptItemList' FROM receipts
SELECT values::json#>>'{rewardsReceiptItemList}' FROM receipts -- embedded JSON file as text.
-- SELECT CAST(values::json#>>'{rewardsReceiptItemList}' AS json) FROM receipts -- rewardsReceiptItemList converted from text to json
SELECT values->>'createDate' FROM receipts
SELECT values::json#>>'{createDate,$date}' FROM receipts -- createDate
SELECT values->>'totalSpent' FROM receipts -- totalSpent
SELECT values->>'purchasedItemCount' FROM receipts -- purchasedItemCount
-- "_id", "userId", "pointsEarned", "bonusPointsEarned", "purchaseDate" "modifyDate" "pointsAwardedDate" "dateScanned" "bonusPointsEarnedReason"
-- "rewardsReceiptStatus", "finishedDate" "rewardsReceiptItemList" "createDate" "totalSpent" "purchasedItemCount"
-- Create table to store JSON parsed values
CREATE TABLE receipts_parsed (
index SERIAL,
id VARCHAR,
user_id VARCHAR,
points_earned DECIMAL,
bonus_points_earned INT,
purchase_date BIGINT,
modify_date BIGINT,
points_awarded_date BIGINT,
date_scanned BIGINT,
bonus_points_reason VARCHAR,
rewards_receipt_status VARCHAR,
finished_date BIGINT,
rewards_receipt_item_list JSON,
create_date BIGINT,
total_spent DECIMAL,
purchased_items_count INT);
-- Verify Table
SELECT * FROM receipts_parsed;
DROP TABLE receipts_parsed;
-- Extract data from receipts into receipts_parsed
INSERT INTO receipts_parsed(id, user_id, points_earned, bonus_points_earned, purchase_date, modify_date, points_awarded_date, date_scanned,
bonus_points_reason, rewards_receipt_status, finished_date, rewards_receipt_item_list, create_date, total_spent, purchased_items_count)
SELECT values::json#>>'{_id,$oid}', values->>'userId', CAST(values->>'pointsEarned' AS DEC), CAST(values->>'bonusPointsEarned' AS INT),
CAST(values::json#>>'{purchaseDate,$date}' AS BIGINT), CAST(values::json#>>'{modifyDate,$date}' AS BIGINT), CAST(values::json#>>'{pointsAwardedDate,$date}' AS BIGINT),
CAST(values::json#>>'{dateScanned,$date}' AS BIGINT), values->>'bonusPointsEarnedReason', values->>'rewardsReceiptStatus', CAST(values::json#>>'{finishedDate,$date}' AS BIGINT),
CAST(values::json#>>'{rewardsReceiptItemList}' AS json), CAST(values::json#>>'{createDate,$date}' AS BIGINT), CAST(values->>'totalSpent' AS DEC), CAST(values->>'purchasedItemCount' AS INT)
FROM receipts;
-- Verify Table
SELECT * FROM receipts_parsed;
-- Convert dates from UNIX EPOCH TO DATE
SELECT purchase_date FROM receipts_parsed -- Inspect data type. Unich epox
SELECT to_timestamp(cast(receipts_parsed.purchase_date/1000 as bigint)) as test FROM receipts_parsed -- expression to change to timestamp
-- Alter table expression to change column data type and add expression for date/time conversion
ALTER TABLE receipts_parsed
ALTER COLUMN purchase_date TYPE timestamp USING to_timestamp(cast(receipts_parsed.purchase_date/1000 as bigint));
-- Verify Table
SELECT * FROM receipts_parsed;
-- Alter the rest of the columns to timestamp
ALTER TABLE receipts_parsed
ALTER COLUMN modify_date TYPE timestamp USING to_timestamp(cast(receipts_parsed.modify_date/1000 as bigint));
ALTER TABLE receipts_parsed
ALTER COLUMN points_awarded_date TYPE timestamp USING to_timestamp(cast(receipts_parsed.points_awarded_date/1000 as bigint));
ALTER TABLE receipts_parsed
ALTER COLUMN date_scanned TYPE timestamp USING to_timestamp(cast(receipts_parsed.date_scanned/1000 as bigint));
ALTER TABLE receipts_parsed
ALTER COLUMN finished_date TYPE timestamp USING to_timestamp(cast(receipts_parsed.finished_date/1000 as bigint));
ALTER TABLE receipts_parsed
ALTER COLUMN create_date TYPE timestamp USING to_timestamp(cast(receipts_parsed.create_date/1000 as bigint));
-- Verify Table
SELECT * FROM receipts_parsed;
------------------------------------------------
-- Table: users
------------------------------------------------
-- Create Table users and import users.json
CREATE TABLE users (values json);
GRANT SELECT ON users TO PUBLIC;
COPY users from 'C:\Users\joszp\Documents\Fetch\users.json';
-- Verify table
SELECT * FROM users
-- Inspect JSON file and extract values to a new table
-- Extract all keys
SELECT DISTINCT(json_object_keys(values)) FROM users; -- 7 keys
-- "_id", "createdDate", "state", "signUpSource", "role", "lastLogin", "active"
-- Identify the path for each feature
SELECT values FROM users;
SELECT values->>'_id' FROM users;
SELECT values::json#>>'{_id,$oid}' FROM users; -- _id
SELECT values->>'createdDate' FROM users;
SELECT values::json#>>'{createdDate,$date}' FROM users; -- createdDate
SELECT values->>'state' FROM users; -- state
SELECT values->>'signUpSource' FROM users; -- signUpSource
SELECT values->>'role' FROM users; -- role
SELECT values->>'lastLogin' FROM users;
SELECT values::json#>>'{lastLogin,$date}' FROM users; -- lastLogin
SELECT values->>'active' FROM users; -- active
-- Create table to store JSON parsed values
CREATE TABLE users_parsed (
index SERIAL,
user_id VARCHAR,
created_date BIGINT,
state VARCHAR,
sign_up_source VARCHAR,
role VARCHAR,
last_login BIGINT,
active BOOLEAN);
-- Verify table
SELECT * FROM users_parsed
DROP TABLE users_parsed
-- Extract data from users into users_parsed
INSERT INTO users_parsed(user_id, created_date, state, sign_up_source, role, last_login, active)
SELECT DISTINCT(values::json#>>'{_id,$oid}'), CAST(values::json#>>'{createdDate,$date}' AS BIGINT), values->>'state', values->>'signUpSource',
values->>'role', CAST(values::json#>>'{lastLogin,$date}' AS BIGINT), CAST(values->>'active' AS BOOLEAN)
FROM users;
-- Verify table
SELECT * FROM users_parsed
-- Alter tables expression to change column data type and add expression for date/time conversion
ALTER TABLE users_parsed
ALTER COLUMN created_date TYPE timestamp USING to_timestamp(cast(users_parsed.created_date/1000 as bigint));
ALTER TABLE users_parsed
ALTER COLUMN last_login TYPE timestamp USING to_timestamp(cast(users_parsed.last_login/1000 as bigint));
-- Verify table
SELECT * FROM receipts_parsed
--------------
-- Add Keys
--------------
ALTER TABLE brands_parsed ADD PRIMARY KEY(id);
ALTER TABLE receipts_parsed ADD PRIMARY KEY(id);
ALTER TABLE receipts_parsed ADD FOREIGN KEY(user_id)
REFERENCES users_parsed(user_id);
ALTER TABLE users_parsed ADD PRIMARY KEY(user_id);