Started using a local database earlier this year to manage large datasets (e.g. for Twittee and Building a static website: BtoBSales.EU) and to overcome the limitations of Grist: size & latency.
#####################
# Connect to database
import sqlite3

db = sqlite3.connect('/path/to/local/database.db')

from collections import namedtuple

def namedtuple_factory(cursor, row):
    """Returns sqlite rows as named tuples."""
    fields = [col[0] for col in cursor.description]
    x = namedtuple("x", fields)
    return x(*row)

db.row_factory = namedtuple_factory
c = db.cursor()

####################
c.execute("""
    SELECT domain
    FROM companies
    WHERE name IS NULL;
    """)
items = [x.domain for x in c.fetchall()]

print()
print(items)
print()
print('items', type(items), len(items))

#####################
# Commit command
db.commit()

# Close connection
db.close()
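A small follow-up sketch, assuming the same hypothetical /path/to/local/database.db and the namedtuple_factory above: an sqlite3 connection can be used as a context manager, which commits on success and rolls back on an exception (it does not close the connection, so close() is still needed).

import sqlite3

# Connection as context manager: auto-commit on success, rollback on error
with sqlite3.connect('/path/to/local/database.db') as db:
    db.row_factory = namedtuple_factory
    c = db.cursor()
    c.execute("SELECT domain FROM companies WHERE name IS NULL;")
    items = [x.domain for x in c.fetchall()]

# The context manager does not close the connection
db.close()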
SELECT with variable string
c.execute(""" SELECT rowid, domain, url FROM twitter_webinars WHERE url LIKE "%jdsupra%" """)#######db=sqlite3.connect('/path/to/local/database.db')c=db.cursor()c.execute(""" SELECT first FROM first WHERE first LIKE ?; """,(f"{email_prefix.lower()}%",))potential_first=[x[0]forxinc.fetchall()]
initial test
import sqlite3

# Connect to database
conn = sqlite3.connect('/path/to/local/database.db')

# Create a cursor
c = conn.cursor()

# Query The Database
c.execute("SELECT * FROM Companies")
items = c.fetchall()

for item in items:
    print(item)

# Commit our command
conn.commit()

# Close our connection
conn.close()
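For large tables, a variant worth noting: the cursor is iterable, so rows can be streamed one by one instead of loading everything with fetchall(). A sketch against the same Companies table and hypothetical database path:

import sqlite3

conn = sqlite3.connect('/path/to/local/database.db')
c = conn.cursor()

# Iterate over the cursor directly instead of calling fetchall()
for item in c.execute("SELECT * FROM Companies"):
    print(item)

conn.close()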
SQL headers
conn = sqlite3.connect('/path/to/local/database.db')

# Create a cursor
c = conn.cursor()

# Query The Database
c.execute("SELECT * FROM Companies")

with conn:
    headers = [i[0] for i in c.description]
def namedtuple_factory(cursor, row):
    """Returns sqlite rows as named tuples."""
    fields = [col[0] for col in cursor.description]
    Tweet = namedtuple("Tweet", fields)
    return Tweet(*row)

conn.row_factory = namedtuple_factory
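The standard library also ships sqlite3.Row as a ready-made row factory: rows then support access by column name and expose keys(), without writing a factory by hand. A small sketch, assuming the Companies table above has a domain column:

import sqlite3

conn = sqlite3.connect('/path/to/local/database.db')
conn.row_factory = sqlite3.Row   # built-in alternative to namedtuple_factory
c = conn.cursor()
c.execute("SELECT * FROM Companies")

row = c.fetchone()
print(row.keys())       # column names
print(row['domain'])    # access by column name

conn.close()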
import CSV to SQLite
# EASIEST
import pandas as pd

db = sqlite3.connect('/path/to/local/database.db')
c = db.cursor()

df = pd.read_csv('/path/to/local/Master.csv')
df.to_sql('companies', db)

####
import csv
import sqlite3

with open('/path/to/local/Master_copy.csv', 'r') as f:
    d = csv.DictReader(f)
    existing = []
    content = []
    for i in d:
        if i['tweet_id'] not in existing:
            existing.append(i['tweet_id'])
            content.append((
                i['tweet_id'],
                i['tweeted'],
                i['tweet_url'],
                i['text'],
                i['author_name'],
                i['author_website'],
                i['domain'],
                i['known'],
                i['url'],
                i['provider'],
                i['process'],
                i['src'],
                i['created'],
                i['updated'],
            ))

######
print(f"CSV loaded with {len(content)} records.")

# Connect to SQLite
sqliteConnection = sqlite3.connect('/path/to/local/database.db')
cursor = sqliteConnection.cursor()

cursor.execute("DROP TABLE IF EXISTS twitter_webinars")

# Create table
table = """
    CREATE TABLE twitter_webinars(
        tweet_id VARCHAR(255) PRIMARY KEY,
        tweeted INT,
        tweet_url CHAR(255),
        text VARCHAR(255),
        author_name VARCHAR(255),
        author_website VARCHAR(255),
        domain VARCHAR(255),
        known VARCHAR(255),
        url VARCHAR(255),
        provider VARCHAR(255),
        process VARCHAR(255),
        src VARCHAR(25),
        created FLOAT,
        updated FLOAT
    );
    """
cursor.execute(table)

cursor.executemany(
    "INSERT INTO twitter_webinars (tweet_id, tweeted, tweet_url, text, author_name, author_website, domain, known, url, provider, process, src, created, updated) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);",
    content)

cursor.execute('SELECT tweet_id FROM twitter_webinars;')

# View result
result = cursor.fetchall()
existing = []
for x in result:
    if x in existing:
        print(f"ISSUE: {x}")
    else:
        existing.append(x)

# Commit work and close connection
sqliteConnection.commit()
cursor.close()
sqliteConnection.close()
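One caveat on the pandas route: to_sql() fails by default if the table already exists, so re-running the import needs the if_exists argument ('replace' or 'append'). A sketch with the same hypothetical paths:

import sqlite3
import pandas as pd

db = sqlite3.connect('/path/to/local/database.db')
df = pd.read_csv('/path/to/local/Master.csv')

# Overwrite the existing table; use 'append' to add rows instead.
# index=False stops pandas from writing its own index column.
df.to_sql('companies', db, if_exists='replace', index=False)

db.close()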
Rather obvious, but it is significantly faster to run a SQL query with sqlite3 than to fetch all records and filter them with a list comprehension in Python.
Ex.:
existing_emails = [x.email for x in select_all_records(DB_BTOB, "people") if x.email != None]
# 2.075 seconds to run script
vs.
import sqlite3

conn = sqlite3.connect(DB_BTOB)
cursor = conn.cursor()
cursor.execute('SELECT email FROM people WHERE email IS NOT NULL')
existing_emails = [row[0] for row in cursor.fetchall()]
cursor.close()
conn.close()
# 0.046 seconds to run script
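The timings above come from the author's data; as a sketch, such a comparison can be measured with time.perf_counter around each variant:

import sqlite3
import time

start = time.perf_counter()

conn = sqlite3.connect(DB_BTOB)
cursor = conn.cursor()
cursor.execute('SELECT email FROM people WHERE email IS NOT NULL')
existing_emails = [row[0] for row in cursor.fetchall()]
cursor.close()
conn.close()

print(f"SQL query: {time.perf_counter() - start:.3f} seconds")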