Skip to content

https://github.com/paradedb/paradedb/blob/dev/pg_bm25/README.md#

{"payload":{"allShortcutsEnabled":false,"fileTree":{"pg_bm25":{"items":[{"name":"sql","path":"pg_bm25/sql","contentType":"directory"},{"name":"src","path":"pg_bm25/src","contentType":"directory"},{"name":"test","path":"pg_bm25/test","contentType":"directory"},{"name":".gitignore","path":"pg_bm25/.gitignore","contentType":"file"},{"name":"Cargo.lock","path":"pg_bm25/Cargo.lock","contentType":"file"},{"name":"Cargo.toml","path":"pg_bm25/Cargo.toml","contentType":"file"},{"name":"README.md","path":"pg_bm25/README.md","contentType":"file"},{"name":"pg_bm25.control","path":"pg_bm25/pg_bm25.control","contentType":"file"}],"totalCount":8},"":{"items":[{"name":".cargo","path":".cargo","contentType":"directory"},{"name":".github","path":".github","contentType":"directory"},{"name":"benchmarks","path":"benchmarks","contentType":"directory"},{"name":"docker","path":"docker","contentType":"directory"},{"name":"docs","path":"docs","contentType":"directory"},{"name":"pg_bm25","path":"pg_bm25","contentType":"directory"},{"name":"pg_search","path":"pg_search","contentType":"directory"},{"name":"scripts","path":"scripts","contentType":"directory"},{"name":".gitignore","path":".gitignore","contentType":"file"},{"name":".hadolint.yaml","path":".hadolint.yaml","contentType":"file"},{"name":".markdownlint.yaml","path":".markdownlint.yaml","contentType":"file"},{"name":".pre-commit-config.yaml","path":".pre-commit-config.yaml","contentType":"file"},{"name":".prettierignore","path":".prettierignore","contentType":"file"},{"name":"CODE_OF_CONDUCT.md","path":"CODE_OF_CONDUCT.md","contentType":"file"},{"name":"CONTRIBUTING.md","path":"CONTRIBUTING.md","contentType":"file"},{"name":"Cargo.lock","path":"Cargo.lock","contentType":"file"},{"name":"Cargo.toml","path":"Cargo.toml","contentType":"file"},{"name":"LICENSE","path":"LICENSE","contentType":"file"},{"name":"README.md","path":"README.md","contentType":"file"},{"name":"SECURITY.md","path":"SECURITY.md","contentType":"file"},{"name":"THANKY
OU.md","path":"THANKYOU.md","contentType":"file"}],"totalCount":21}},"fileTreeProcessingTime":5.458953,"foldersToFetch":[],"reducedMotionEnabled":null,"repo":{"id":660776503,"defaultBranch":"dev","name":"paradedb","ownerLogin":"paradedb","currentUserCanPush":false,"isFork":false,"isEmpty":false,"createdAt":"2023-06-30T20:21:47.000Z","ownerAvatar":"https://avatars.githubusercontent.com/u/124464601?v=4","public":true,"private":false,"isOrgOwned":true},"symbolsExpanded":false,"treeExpanded":true,"refInfo":{"name":"dev","listCacheKey":"v0:1696784857.0","canEdit":false,"refType":"branch","currentOid":"bb4f2890942b85be3e9736bb3e8f17dcf659c0a1"},"path":"pg_bm25/README.md","currentUser":null,"blob":{"rawLines":null,"stylingDirectives":null,"csv":null,"csvError":null,"dependabotInfo":{"showConfigurationBanner":false,"configFilePath":null,"networkDependabotPath":"/paradedb/paradedb/network/updates","dismissConfigurationNoticePath":"/settings/dismiss-notice/dependabot_configuration_notice","configurationNoticeDismissed":null,"repoAlertsPath":"/paradedb/paradedb/security/dependabot","repoSecurityAndAnalysisPath":"/paradedb/paradedb/settings/security_analysis","repoOwnerIsOrg":true,"currentUserCanAdminRepo":false},"displayName":"README.md","displayUrl":"https://github.com/paradedb/paradedb/blob/dev/pg_bm25/README.md?raw=true","headerInfo":{"blobSize":"6.7 KB","deleteInfo":{"deleteTooltip":"You must be signed in to make or propose changes"},"editInfo":{"editTooltip":"You must be signed in to make or propose 
changes"},"ghDesktopPath":"https://desktop.github.com","gitLfsPath":null,"onBranch":true,"shortPath":"2838c67","siteNavLoginPath":"/login?return_to=https%3A%2F%2Fgithub.com%2Fparadedb%2Fparadedb%2Fblob%2Fdev%2Fpg_bm25%2FREADME.md","isCSV":false,"isRichtext":true,"toc":[{"level":1,"text":"","anchor":"--","htmlText":""},{"level":2,"text":"Overview","anchor":"overview","htmlText":"Overview"},{"level":3,"text":"Roadmap","anchor":"roadmap","htmlText":"Roadmap"},{"level":2,"text":"Running the Extension","anchor":"running-the-extension","htmlText":"Running the Extension"},{"level":3,"text":"From ParadeDB","anchor":"from-paradedb","htmlText":"From ParadeDB"},{"level":3,"text":"From Self-Hosted Postgres","anchor":"from-self-hosted-postgres","htmlText":"From Self-Hosted Postgres"},{"level":2,"text":"Usage","anchor":"usage","htmlText":"Usage"},{"level":3,"text":"Indexing","anchor":"indexing","htmlText":"Indexing"},{"level":3,"text":"Basic Search","anchor":"basic-search","htmlText":"Basic Search"},{"level":2,"text":"Development","anchor":"development","htmlText":"Development"},{"level":3,"text":"Prerequisites","anchor":"prerequisites","htmlText":"Prerequisites"},{"level":3,"text":"Running the Extension","anchor":"running-the-extension-1","htmlText":"Running the Extension"},{"level":3,"text":"Modifying the Extension","anchor":"modifying-the-extension","htmlText":"Modifying the 
Extension"},{"level":3,"text":"Testing","anchor":"testing","htmlText":"Testing"},{"level":2,"text":"License","anchor":"license","htmlText":"License"}],"lineInfo":{"truncatedLoc":"260","truncatedSloc":"188"},"mode":"file"},"image":false,"isCodeownersFile":null,"isPlain":false,"isValidLegacyIssueTemplate":false,"issueTemplateHelpUrl":"https://docs.github.com/articles/about-issue-and-pull-request-templates","issueTemplate":null,"discussionTemplate":null,"language":"Markdown","languageID":222,"large":false,"loggedIn":false,"newDiscussionPath":"/paradedb/paradedb/discussions/new","newIssuePath":"/paradedb/paradedb/issues/new","planSupportInfo":{"repoIsFork":null,"repoOwnedByCurrentUser":null,"requestFullPath":"/paradedb/paradedb/blob/dev/pg_bm25/README.md","showFreeOrgGatedFeatureMessage":null,"showPlanSupportBanner":null,"upgradeDataAttributes":null,"upgradePath":null},"publishBannersInfo":{"dismissActionNoticePath":"/settings/dismiss-notice/publish_action_from_dockerfile","dismissStackNoticePath":"/settings/dismiss-notice/publish_stack_from_file","releasePath":"/paradedb/paradedb/releases/new?marketplace=true","showPublishActionBanner":false,"showPublishStackBanner":false},"rawBlobUrl":"https://github.com/paradedb/paradedb/raw/dev/pg_bm25/README.md","renderImageOrRaw":false,"richText":"

\n \"pg_bm25\"\n
\n

\n

\"Testing\"

\n

Overview

\n

pg_bm25 is a PostgreSQL extension that enables full text search over SQL tables\nusing the BM25 algorithm, the state-of-the-art ranking function\nfor full text search. It is built on top of Tantivy, the Rust-based alternative to Apache\nLucene, using pgrx.

\n

pg_bm25 is supported on PostgreSQL 11+.

\n

Check out the pg_bm25 benchmarks here.

\n

Roadmap

\n
    \n
  • BM25 scoring
  • \n
  • Highlighting
  • \n
  • Boosted queries
  • \n
  • Filtering
  • \n
  • Bucket and metrics aggregations
  • \n
  • Autocomplete
  • \n
  • Fuzzy search
  • \n
  • Custom tokenizers
  • \n
  • JSON field search
  • \n
  • Datetime aggregations
  • \n
  • Facet fields
  • \n
\n

Running the Extension

\n

From ParadeDB

\n

The easiest way to test the extension is to run the ParadeDB Docker image:

\n
docker run \\n  -e POSTGRES_USER=<user> \\n  -e POSTGRES_PASSWORD=<password> \\n  -e POSTGRES_DB=<dbname> \\n  -p 5432:5432 \\n  -d \\n  paradedb/paradedb:latest
\n

This will spin up a Postgres instance with pg_bm25 preinstalled.

\n

From Self-Hosted Postgres

\n

If you are self-hosting Postgres and would like to use the extension within your existing\nPostgres, follow these steps:

\n
    \n
  1. Install Rust and cargo-pgrx:
\n
\n
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh\ncargo install cargo-pgrx --version 0.9.8
\n
    \n
  2. Then, run:
\n
\n
# Clone the repo (optionally pick a specific version)\ngit clone https://github.com/paradedb/paradedb.git --tag <VERSION>\n\n# Install pg_bm25\ncd pg_bm25/\ncargo pgrx init --pg<YOUR-POSTGRES-MAJOR_VERSION>=`which pg_config`\ncargo pgrx install
\n

You can then create the extension in your database by running:

\n
CREATE EXTENSION pg_bm25;
\n

If you are using a managed Postgres service like Amazon RDS, you will not be able to install pg_bm25\nuntil the Postgres service explicitly supports it.

\n

Usage

\n

Indexing

\n

By default, the pg_bm25 extension creates a table called paradedb.mock_items\nthat you can use for quick experimentation.

\n

To index a table, use the following SQL command:

\n
CREATE TABLE mock_items AS SELECT * FROM paradedb.mock_items;\n\nCREATE INDEX idx_mock_items\nON mock_items\nUSING bm25 ((mock_items.*))\nWITH (text_fields='{\"description\": {}, \"category\": {}}');
\n

Once the indexing is complete, you can run various search functions on it.

\n

Basic Search

\n

Execute a search query on your indexed table:

\n
SELECT description, rating, category\nFROM mock_items\nWHERE mock_items @@@ 'description:keyboard OR category:electronics'\nLIMIT 5;
\n

This will return:

\n
         description         | rating |  category\n-----------------------------+--------+-------------\n Plastic Keyboard            |      4 | Electronics\n Ergonomic metal keyboard    |      4 | Electronics\n Innovative wireless earbuds |      5 | Electronics\n Fast charging power bank    |      4 | Electronics\n Bluetooth-enabled speaker   |      3 | Electronics\n(5 rows)\n
\n

Scoring and highlighting are supported:

\n
SELECT description, rating, category, paradedb.rank_bm25(ctid), paradedb.highlight_bm25(ctid, 'description')\nFROM mock_items\nWHERE mock_items @@@ 'description:keyboard OR category:electronics'\nLIMIT 5;
\n

This will return:

\n
 id |         description         | rating |  category   | rank_bm25 |         highlight_bm25\n----+-----------------------------+--------+-------------+-----------+---------------------------------\n  1 | Ergonomic metal keyboard    |      4 | Electronics | 4.9403534 | Ergonomic metal <b>keyboard</b>\n  2 | Very plasticy keyboard      |      4 | Electronics | 4.9403534 | Very plasticy <b>keyboard</b>\n 12 | Innovative wireless earbuds |      5 | Electronics | 2.1096356 |\n 22 | Fast charging power bank    |      4 | Electronics | 2.1096356 |\n 32 | Bluetooth-enabled speaker   |      3 | Electronics | 2.1096356 |\n(5 rows)\n
\n

Scores can be tuned via boosted queries:

\n
SELECT description, rating, category\nFROM mock_items\nWHERE mock_items @@@ 'description:keyboard^2 OR category:electronics';
\n

New data that arrives or rows that are changed are automatically reindexed and searchable. For instance,\nlet's create and search for a new row in our table:

\n
INSERT INTO mock_items (description, rating, category) VALUES ('New keyboard', 5, 'Electronics');\n\nSELECT description, rating, category\nFROM mock_items\nWHERE mock_items @@@ 'description:keyboard OR category:electronics'\nLIMIT 5;
\n

This will return:

\n
         description         | rating |  category\n-----------------------------+--------+-------------\n New keyboard                |      5 | Electronics\n Plastic Keyboard            |      4 | Electronics\n Ergonomic metal keyboard    |      4 | Electronics\n Innovative wireless earbuds |      5 | Electronics\n Fast charging power bank    |      4 | Electronics\n(5 rows)\n
\n

Please refer to the documentation for a more thorough overview\nof pg_bm25's query support.

\n

Development

\n

Prerequisites

\n

Before developing the extension, ensure that you have Rust installed\n(version >1.70), ideally via rustup (we've observed issues with installing Rust\nvia Homebrew on macOS).

\n

Then, install and initialize pgrx:

\n
cargo install cargo-pgrx --version 0.9.8\ncargo pgrx init
\n

Running the Extension

\n

First, start pgrx:

\n
cargo pgrx run
\n

This will launch an interactive connection to Postgres. Inside Postgres, create\nthe extension by running:

\n
CREATE EXTENSION pg_bm25;
\n

Now, you have access to all the extension functions.

\n

Modifying the Extension

\n

If you make changes to the extension code, follow these steps to update it:

\n
    \n
  1. Recompile the extension:
\n
\n
cargo pgrx run
\n
    \n
  2. Recreate the extension to load the latest changes:
\n
\n
DROP EXTENSION pg_bm25;\nCREATE EXTENSION pg_bm25;
\n

Testing

\n

To run the unit test suite, use the following command:

\n
cargo pgrx test
\n

This will run all unit tests defined in /src. To add a new unit test, simply add\ntests inline in the relevant files, using the #[cfg(test)] attribute.

\n

To run the integration test suite, simply run:

\n
./test/runtests.sh
\n

This will create a temporary database, initialize it with the SQL commands defined\nin fixtures.sql, and run the tests in /test/sql against it. To add a new test,\nsimply add a new .sql file to /test/sql and a corresponding .out file to\n/test/expected for the expected output, and it will automatically get picked up\nby the test suite.

\n

License

\n

pg_bm25 is licensed under the GNU Affero General Public License v3.0.

\n
","renderedFileInfo":null,"shortPath":null,"tabSize":8,"topBannersInfo":{"overridingGlobalFundingFile":false,"globalPreferredFundingPath":null,"repoOwner":"paradedb","repoName":"paradedb","showInvalidCitationWarning":false,"citationHelpUrl":"https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/creating-a-repository-on-github/about-citation-files","showDependabotConfigurationBanner":false,"actionsOnboardingTip":null},"truncated":false,"viewable":true,"workflowRedirectUrl":null,"symbols":{"timedOut":false,"notAnalyzed":false,"symbols":[{"name":" Overview","kind":"section_2","identStart":275,"identEnd":284,"extentStart":273,"extentEnd":918,"fullyQualifiedName":" Overview","identUtf16":{"start":{"lineNumber":7,"utf16Col":2},"end":{"lineNumber":7,"utf16Col":11}},"extentUtf16":{"start":{"lineNumber":7,"utf16Col":0},"end":{"lineNumber":32,"utf16Col":0}}},{"name":" Roadmap","kind":"section_3","identStart":660,"identEnd":668,"extentStart":657,"extentEnd":918,"fullyQualifiedName":" Roadmap","identUtf16":{"start":{"lineNumber":18,"utf16Col":3},"end":{"lineNumber":18,"utf16Col":11}},"extentUtf16":{"start":{"lineNumber":18,"utf16Col":0},"end":{"lineNumber":32,"utf16Col":0}}},{"name":" Running the Extension","kind":"section_2","identStart":920,"identEnd":942,"extentStart":918,"extentEnd":2100,"fullyQualifiedName":" Running the Extension","identUtf16":{"start":{"lineNumber":32,"utf16Col":2},"end":{"lineNumber":32,"utf16Col":24}},"extentUtf16":{"start":{"lineNumber":32,"utf16Col":0},"end":{"lineNumber":83,"utf16Col":0}}},{"name":" From ParadeDB","kind":"section_3","identStart":947,"identEnd":961,"extentStart":944,"extentEnd":1274,"fullyQualifiedName":" From ParadeDB","identUtf16":{"start":{"lineNumber":34,"utf16Col":3},"end":{"lineNumber":34,"utf16Col":17}},"extentUtf16":{"start":{"lineNumber":34,"utf16Col":0},"end":{"lineNumber":50,"utf16Col":0}}},{"name":" From Self-Hosted 
Postgres","kind":"section_3","identStart":1277,"identEnd":1303,"extentStart":1274,"extentEnd":2100,"fullyQualifiedName":" From Self-Hosted Postgres","identUtf16":{"start":{"lineNumber":50,"utf16Col":3},"end":{"lineNumber":50,"utf16Col":29}},"extentUtf16":{"start":{"lineNumber":50,"utf16Col":0},"end":{"lineNumber":83,"utf16Col":0}}},{"name":" Usage","kind":"section_2","identStart":2102,"identEnd":2108,"extentStart":2100,"extentEnd":5214,"fullyQualifiedName":" Usage","identUtf16":{"start":{"lineNumber":83,"utf16Col":2},"end":{"lineNumber":83,"utf16Col":8}},"extentUtf16":{"start":{"lineNumber":83,"utf16Col":0},"end":{"lineNumber":185,"utf16Col":0}}},{"name":" Indexing","kind":"section_3","identStart":2113,"identEnd":2122,"extentStart":2110,"extentEnd":2579,"fullyQualifiedName":" Indexing","identUtf16":{"start":{"lineNumber":85,"utf16Col":3},"end":{"lineNumber":85,"utf16Col":12}},"extentUtf16":{"start":{"lineNumber":85,"utf16Col":0},"end":{"lineNumber":103,"utf16Col":0}}},{"name":" Basic Search","kind":"section_3","identStart":2582,"identEnd":2595,"extentStart":2579,"extentEnd":5214,"fullyQualifiedName":" Basic Search","identUtf16":{"start":{"lineNumber":103,"utf16Col":3},"end":{"lineNumber":103,"utf16Col":16}},"extentUtf16":{"start":{"lineNumber":103,"utf16Col":0},"end":{"lineNumber":185,"utf16Col":0}}},{"name":" Development","kind":"section_2","identStart":5216,"identEnd":5228,"extentStart":5214,"extentEnd":6766,"fullyQualifiedName":" Development","identUtf16":{"start":{"lineNumber":185,"utf16Col":2},"end":{"lineNumber":185,"utf16Col":14}},"extentUtf16":{"start":{"lineNumber":185,"utf16Col":0},"end":{"lineNumber":257,"utf16Col":0}}},{"name":" Prerequisites","kind":"section_3","identStart":5233,"identEnd":5247,"extentStart":5230,"extentEnd":5531,"fullyQualifiedName":" 
Prerequisites","identUtf16":{"start":{"lineNumber":187,"utf16Col":3},"end":{"lineNumber":187,"utf16Col":17}},"extentUtf16":{"start":{"lineNumber":187,"utf16Col":0},"end":{"lineNumber":200,"utf16Col":0}}},{"name":" Running the Extension","kind":"section_3","identStart":5534,"identEnd":5556,"extentStart":5531,"extentEnd":5805,"fullyQualifiedName":" Running the Extension","identUtf16":{"start":{"lineNumber":200,"utf16Col":3},"end":{"lineNumber":200,"utf16Col":25}},"extentUtf16":{"start":{"lineNumber":200,"utf16Col":0},"end":{"lineNumber":217,"utf16Col":0}}},{"name":" Modifying the Extension","kind":"section_3","identStart":5808,"identEnd":5832,"extentStart":5805,"extentEnd":6085,"fullyQualifiedName":" Modifying the Extension","identUtf16":{"start":{"lineNumber":217,"utf16Col":3},"end":{"lineNumber":217,"utf16Col":27}},"extentUtf16":{"start":{"lineNumber":217,"utf16Col":0},"end":{"lineNumber":234,"utf16Col":0}}},{"name":" Testing","kind":"section_3","identStart":6088,"identEnd":6096,"extentStart":6085,"extentEnd":6766,"fullyQualifiedName":" Testing","identUtf16":{"start":{"lineNumber":234,"utf16Col":3},"end":{"lineNumber":234,"utf16Col":11}},"extentUtf16":{"start":{"lineNumber":234,"utf16Col":0},"end":{"lineNumber":257,"utf16Col":0}}},{"name":" License","kind":"section_2","identStart":6768,"identEnd":6776,"extentStart":6766,"extentEnd":6864,"fullyQualifiedName":" License","identUtf16":{"start":{"lineNumber":257,"utf16Col":2},"end":{"lineNumber":257,"utf16Col":10}},"extentUtf16":{"start":{"lineNumber":257,"utf16Col":0},"end":{"lineNumber":260,"utf16Col":0}}}]}},"copilotInfo":null,"csrf_tokens":{"/paradedb/paradedb/branches":{"post":"CIkOdypcS77rkRiP7Ca_QbpRCcJaqPbJb2MfgsEKqtDFJWggInc5WhvJ4sepfLfNGguBbHib5eT_XkZq0Ea6Lg"},"/repos/preferences":{"post":"t9Mz8it5u4ejrpfGz-i_yxLAF_9_XJOD5suIe4VLidYpRXdDVHyta-02A1-SOP3qfqeNIqgVUCkCoTAe62Aemg"}}},"title":"paradedb/pg_bm25/README.md at dev · paradedb/paradedb"}